From dd70db6cabc8bfb9bf12b1fe4db0a44414067597 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 14 Jun 2017 10:28:45 -0400 Subject: [PATCH 01/94] Sample subclassing messaging and more information control in interactive test session --- looper/models.py | 11 +++++++---- tests/conftest.py | 12 +++++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/looper/models.py b/looper/models.py index 78f76078..0f23a784 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1091,11 +1091,10 @@ def alpha_cased(text, lower=False): def make_samples(self): """ - Create samples from annotation sheet (considering library), - and them to the project. + Create samples (considering library) from annotation sheet, + and add them to the project. """ - found_pipelines = False try: import pipelines # Use a pipelines package if installed. except ImportError: @@ -1109,20 +1108,24 @@ def make_samples(self): _LOGGER.debug( "Added {} pipelines path(s) to sys.path: {}". format(len(pipeline_dirpaths), pipeline_dirpaths)) + else: + _LOGGER.debug("No pipelines directories to add to import path") try: import pipelines except ImportError: - pass + found_pipelines = False else: found_pipelines = True else: found_pipelines = True if not found_pipelines: + _LOGGER.debug("Could not import pipelines") # Just return a basic Sample for each of the sheet's rows. def make_sample(data): return Sample(data) else: + _LOGGER.debug("Successfully imported pipelines") # Attempt creation of Sample subtype specific to protocol. # Get all pipelines package Sample subclasses. diff --git a/tests/conftest.py b/tests/conftest.py index 96cdf5f9..335093b3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -211,7 +211,7 @@ def interactive(prj_lines=PROJECT_CONFIG_LINES, iface_lines=PIPELINE_INTERFACE_CONFIG_LINES, merge_table_lines = MERGE_TABLE_LINES, sample_annotation_lines=SAMPLE_ANNOTATION_LINES, - project_kwargs=None): + loglevel=logging.DEBUG, project_kwargs=None): """ Create Project and PipelineInterface instances from default or given data. @@ -227,9 +227,19 @@ def interactive(prj_lines=PROJECT_CONFIG_LINES, table file :param collections.Iterable[str] sample_annotation_lines: lines for a sample annotations file + :param str | int loglevel: level at which to attend to log messages :param dict project_kwargs: keyword arguments for Project constructor :return Project, PipelineInterface: one Project and one PipelineInterface, """ + + # Establish logging for interactive session + import logging, sys + h = logging.StreamHandler(sys.stdout) + h.setLevel(loglevel) + logging.root.setLevel(loglevel) + logging.root.addHandler(h) + + # TODO: don't work with tempfiles once ctors tolerate Iterable. 
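    # A rough usage sketch for this helper, assuming this module's default
    # config/annotation lines and an interactive shell (the calls below are
    # only illustrative, not part of the patch's test code):
    #
    #     prj, iface = interactive(loglevel=logging.DEBUG)
    #     # DEBUG-level messages from looper.models (e.g. the Sample subtype
    #     # search during sample creation) are then echoed to stdout via the
    #     # handler configured above.
    #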
dirpath = tempfile.mkdtemp() path_conf_file = _write_temp( From 29dc0e0afe8889f5a0fa4c94e18cac6880eeda8c Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 14 Jun 2017 14:06:06 -0400 Subject: [PATCH 02/94] amid pull-apart of pipelines Sample subclass search --- looper/models.py | 22 ++++- looper/utils.py | 7 +- tests/models/independent/test_Sample.py | 115 +++++++++++++++++++++--- 3 files changed, 126 insertions(+), 18 deletions(-) diff --git a/looper/models.py b/looper/models.py index 0f23a784..7eefa8a5 100644 --- a/looper/models.py +++ b/looper/models.py @@ -704,7 +704,7 @@ def parse_config_file(self, subproject=None): def _ensure_absolute(self, maybe_relpath): - _LOGGER.debug("Ensuring absolute path for '%s'", maybe_relpath) + _LOGGER.debug("Ensuring absolute: '%s'", maybe_relpath) if _os.path.isabs(maybe_relpath) or is_url(maybe_relpath): _LOGGER.debug("Already absolute") return maybe_relpath @@ -1094,7 +1094,22 @@ def make_samples(self): Create samples (considering library) from annotation sheet, and add them to the project. """ + create_sample = self.find_sample_subtypes() + for _, row in self.df.iterrows(): + self.samples.append(create_sample(row.dropna())) + + + def find_sample_subtypes(self): + """ + Determine how to create Sample instances. + Search modules for classes that extend Sample in order to find + those that are more specifically tailored to a particular + data or experiment type. + + :return function(Mapping | pd.core.series.Series) -> Sample: function + that takes input data and creates a Sample (or perhaps a subclass). + """ try: import pipelines # Use a pipelines package if installed. except ImportError: @@ -1145,8 +1160,7 @@ def make_sample(data): except (AttributeError, KeyError): return Sample(data) - for _, row in self.df.iterrows(): - self.samples.append(make_sample(row.dropna())) + return make_sample def as_data_frame(self): @@ -1199,7 +1213,7 @@ class Sample(object): Class to model Samples based on a pandas Series. :param series: Sample's data. - :type series: pandas.core.series.Series + :type series: Mapping | pandas.core.series.Series :Example: diff --git a/looper/utils.py b/looper/utils.py index d092dc88..e65eb841 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -33,10 +33,11 @@ def fetch_package_classes(pkg, predicate=None): """ import inspect import itertools + modules = [pkg] if inspect.ismodule(pkg) else \ + [obj for obj in inspect.getmembers( + pkg, lambda member: inspect.ismodule(member))] return list(itertools.chain( - *[inspect.getmembers(mod, predicate) - for mod in inspect.getmembers( - pkg, lambda obj: inspect.ismodule(obj))])) + *[inspect.getmembers(mod, predicate) for mod in modules])) diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py index 6532ee38..aa456bde 100644 --- a/tests/models/independent/test_Sample.py +++ b/tests/models/independent/test_Sample.py @@ -1,12 +1,13 @@ """ Tests for the Sample. """ import os +import tempfile import mock import numpy as np from pandas import Series import pytest import looper -from looper.models import Sample +from looper.models import Sample, SAMPLE_NAME_COLNAME __author__ = "Vince Reuter" @@ -14,17 +15,24 @@ +def pytest_generate_tests(metafunc): + """ Customization of this module's test cases. 
""" + if metafunc.cls == CustomSampleTests and \ + "subclass_attrname" in metafunc.fixturenames: + metafunc.parametrize(argnames="subclass_attrname", + argvalues=["library", "protocol"]) + + + class ParseSampleImplicationsTests: """ Tests for appending columns/fields to a Sample based on a mapping. """ - IMPLIER_NAME = "sample_name" + IMPLIER_NAME = SAMPLE_NAME_COLNAME IMPLIER_VALUES = ["a", "b"] SAMPLE_A_IMPLICATIONS = {"genome": "hg38", "phenome": "hg72"} SAMPLE_B_IMPLICATIONS = {"genome": "hg38"} IMPLICATIONS = [SAMPLE_A_IMPLICATIONS, SAMPLE_B_IMPLICATIONS] - IMPLICATIONS_MAP = { - IMPLIER_NAME: IMPLICATIONS - } + IMPLICATIONS_MAP = {IMPLIER_NAME: IMPLICATIONS} def test_project_lacks_implications(self, sample): @@ -128,7 +136,7 @@ def sample(self, request): data = request.getfixturevalue("data") else: data = {} - data.setdefault("sample_name", "test-sample") + data.setdefault(SAMPLE_NAME_COLNAME, "test-sample") # Mock the validation and return a new Sample. rubber_stamper = mock.MagicMock(return_value=[]) @@ -150,17 +158,101 @@ class SampleRequirementsTests: ids=lambda has_name: "has_name: {}".format(has_name)) def test_requires_sample_name(self, has_name, data_type): data = {} - sample_name_key = "sample_name" sample_name = "test-sample" if has_name: - data[sample_name_key] = sample_name + data[SAMPLE_NAME_COLNAME] = sample_name sample = Sample(data_type(data)) - assert sample_name == getattr(sample, sample_name_key) + assert sample_name == getattr(sample, SAMPLE_NAME_COLNAME) else: with pytest.raises(ValueError): Sample(data_type(data)) +from looper.models import Sample +class DummySampleSubclass(Sample): + """ Subclass shell to test ability of Project to find Sample subclass. """ + __library__ = "arbitrary" + pass + + + +class CustomSampleTests: + """ Bespoke Sample creation tests. """ + + + PROTOCOLS = ["WGBS", "RRBS", "ATAC-Seq", "RNA-seq"] + + + @pytest.mark.fixture(scope="function") + def sample_subclass_definition(self, tmpdir, request): + subclass_attrname = request.getfixturevalue("subclass_attrname") + pipelines_type = request.getfixturevalue("pipelines_type") + if "pipe_path" in request.fixturenames: + pipe_path = tmpdir.strpath + else: + pipe_path = request.getfixturevalue("pipe_path") + if pipelines_type == "module": + pipe_path = os.path.join(pipe_path, "pipelines.py") + elif pipelines_type == "package": + init_file = os.path.join(pipe_path, "__init__.py") + with open(init_file, 'w') as f: + pass + module_file = tempfile.NamedTemporaryFile(dir=pipe_path, suffix=".py", delete=False) + with open(module_file, 'w') as modfile: + # TODO: write out definition. + pass + else: + raise ValueError( + "Unknown pipelines type: {}; module and package " + "are supported".format(pipelines_type)) + + # TODO: ensure cleanup. + request.addfinalizer() + + + DATA_FOR_SAMPLES = { + SAMPLE_NAME_COLNAME: ["sample{}".format(i) for i in range(3)], + "arbitrary-value": list(np.random.randint(-1000, 1000, size=3))} + + + CLASS_DEFINITION_LINES = """\"\"\" Sample subclass test file. \"\"\" + + from looper.models import Sample + + class DummySampleSubclass(Sample): + \"\"\" Subclass shell to test Project's Sample subclass seek sensitivity. \"\"\" + __{attribute_name}__ = {attribute_value} + pass + + class NotSampleSubclass(Sample): + \"\"\" Subclass shell to test Project's Sample subclass seek specificity. \"\"\" + __unrecognized__ = irrelevant + + """ + + + def test_generic_sample_for_unfindable_subclass(self): + """ If no Sample subclass is found, a generic Sample is created. 
""" + pass + + + @pytest.mark.parametrize( + argnames="pipelines_type", argvalues=["module", "package"]) + def test_raw_pipelines_import_has_sample_subclass(self, subclass_attrname): + """ Project finds Sample subclass in pipelines package. """ + pass + + + def test_project_pipelines_dir_has_sample_subclass(self, subclass_attrname): + """ Project finds Sample subclass in optional pipelines_dir. """ + pass + + + def test_sample_subclass_messaging(self, subclass_attrname): + """ Sample subclass seek process provides info about procedure. """ + pass + + @pytest.mark.parametrize( argnames="accessor", argvalues=["attr", "item"], @@ -168,7 +260,7 @@ def test_requires_sample_name(self, has_name, data_type): @pytest.mark.parametrize(argnames="data_type", argvalues=[dict, Series]) def test_exception_type_matches_access_mode(data_type, accessor): """ Exception for attribute access failure reflects access mode. """ - data = {"sample_name": "placeholder"} + data = {SAMPLE_NAME_COLNAME: "placeholder"} sample = Sample(data_type(data)) if accessor == "attr": with pytest.raises(AttributeError): @@ -191,6 +283,7 @@ def test_exception_type_matches_access_mode(data_type, accessor): argnames="preexists", argvalues=[False, True], ids=lambda exists: "preexists={}".format(exists)) def test_make_sample_dirs(paths, preexists, tmpdir): + """ Existence guarantee Sample instance's folders is safe and valid. """ # Derive full paths and assure nonexistence before creation. fullpaths = [] @@ -202,7 +295,7 @@ def test_make_sample_dirs(paths, preexists, tmpdir): fullpaths.append(fullpath) # Make the sample and assure paths preexistence. - s = Sample({"sample_name": "placeholder"}) + s = Sample({SAMPLE_NAME_COLNAME: "placeholder"}) s.paths = fullpaths # Base the test's initial condition on the parameterization. From b9923a92270f037094ff53a426fc19e181a45bc0 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 14 Jun 2017 17:43:17 -0400 Subject: [PATCH 03/94] remove old examples --- examples/microtest_merge_table.csv | 4 - examples/microtest_project_config.yaml | 94 ------------------------ examples/microtest_sample_annotation.csv | 16 ---- 3 files changed, 114 deletions(-) delete mode 100644 examples/microtest_merge_table.csv delete mode 100644 examples/microtest_project_config.yaml delete mode 100644 examples/microtest_sample_annotation.csv diff --git a/examples/microtest_merge_table.csv b/examples/microtest_merge_table.csv deleted file mode 100644 index 2bcf237f..00000000 --- a/examples/microtest_merge_table.csv +++ /dev/null @@ -1,4 +0,0 @@ -sample_name,data_source,file_number -rrbs,microtest_merge,1 -wgbs,microtest_merge,1 -wgbs,microtest_merge,2 diff --git a/examples/microtest_project_config.yaml b/examples/microtest_project_config.yaml deleted file mode 100644 index 1109d790..00000000 --- a/examples/microtest_project_config.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# This project config file describes all *project-specific variables* -# Its primary purpose as as input to Looper, which will submit jobs as appropriate -# for each sample in the project. -# But it is also read by other tools, including: -# - project sample loop (primary purpose) -# - make_trackhubs scripts to produce web accessible results -# - stats summary scripts -# - analysis scripts requiring pointers to metadata, results, and other options. 
- -metadata: - # output_dir: ABSOLUTE PATH to the parent, shared space where project results go - output_dir: /scratch/lab_bock/shared/projects/microtest - # results and submission subdirs are subdirectors directories under parent output_dir - # results: where output sample folders will go - # submission: where cluster submit scripts and log files will go - results_subdir: results_pipeline - submission_subdir: submission - # pipelines_dir: ABSOLUTE PATH the directory where the Looper will find pipeline - # scripts (and accompanying pipeline config files) for submission. - pipelines_dir: $CODEBASE/pipelines - # Elements in this section can be absolute or relative. - # Typically, this project config file is stored with the project metadata, so - # relative paths are considered relative to this project config file. - # sample_annotation: one-row-per-sample metadata - sample_annotation: microtest_sample_annotation.csv - # merge_table: input for samples with more than one input file - merge_table: microtest_merge_table.csv - # compare_table: comparison pairs or groups, like normalization samples - compare_table: null.csv - - -# a list of annotation sheet columns that are "derived" -# the values in these are constructed using a regex-like expression -# of variables (defined in the next section). -derived_columns: [data_source] - - -data_sources: - # specify the ABSOLUTE PATH of input files using variable path expressions - # entries correspond to values in the data_source column in sample_annotation table - # {variable} can be used to replace environment variables or other sample_annotation columns - # If you use {variable} codes, you should quote the field so python can parse it. - bsf_samples: "{RAWDATA}{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam" - microtest: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}.bam" - microtest_merge: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}{file_number}.bam" - - -subprojects: - config_test: - pipeline_config: - wgbs.py: wgbs_ds.yaml - - -genomes: - human: hg19 - mouse: mm10 - -transcriptomes: - human: hg19_cdna - mouse: mm10_cdna - - -pipeline_config: - # pipeline configuration files used in project. - # Key string must match the _name of the pipeline script_ (including extension) - # Relative paths are relative to this project config file. - # Default (null) means use the generic config for the pipeline. - # wgbs.py: null - # Or you can point to a specific config to be used in this project: - # rrbs.py: rrbs_config.yaml - # wgbs.py: wgbs_config.yaml - # cgps: cpgs_config.yaml - - -pipeline_args: - rnaBitSeq.py: - "-w": 50 - - -trackhubs: - trackhub_dir: /data/groups/lab_bock/public_html/arendeiro/microtest/ - # url: if you include this, the make_trackhubs will produce a link to your track hub in the project folder. 
- url: http://www.whatever.com/ - matrix_x: cell_type - matrix_y: cell_count - sort_order: cell_type=+ - parent_track_name: ews_rrbs - visibility: dense - hub_name: ews_hub - short_label_column: sample_name - email: arendeiro@cemm.oeaw.ac.at - -username: user -email: user@email.com diff --git a/examples/microtest_sample_annotation.csv b/examples/microtest_sample_annotation.csv deleted file mode 100644 index bc9b1b49..00000000 --- a/examples/microtest_sample_annotation.csv +++ /dev/null @@ -1,16 +0,0 @@ -sample_name,library,organism,ip,data_source -atac-seq_PE,ATAC-seq,human,,microtest -atac-seq_SE,ATAC-seq,human,,microtest -chip-seq_PE,CHIP-seq,human,H3K27ac,microtest -chip-seq_SE,CHIP-seq,human,H3K27ac,microtest -chipmentation_PE,ChIPmentation,human,H3K27ac,microtest -chipmentation_SE,ChIPmentation,human,H3K27ac,microtest -cpgseq_example_data,CpG-seq,human,,microtest -quant-seq_SE,Quant-seq,human,,microtest -rrbs,RRBS,human,,microtest -rrbs_PE,RRBS,human,,microtest -wgbs,WGBS,human,,microtest -RNA_TRUseq_50SE,SMART,human,,microtest -RNA_SMART_50SE,SMART,human,,microtest -rrbs_PE_fq,RRBS,human,,microtest -rrbs_fq,RRBS,human,,microtest From 49f1e5f138f3a67d7f528025d6b276b5746b5f45 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 14 Jun 2017 18:42:32 -0400 Subject: [PATCH 04/94] better failure for nonexistent pipelines location; pull apart search for Sample subclasses; clean up messaging and comments --- looper/__init__.py | 8 +++++-- looper/looper.py | 35 ++++++++++++++++----------- looper/models.py | 59 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 77 insertions(+), 25 deletions(-) diff --git a/looper/__init__.py b/looper/__init__.py index 19e32a81..11dcf312 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -25,8 +25,12 @@ def setup_looper_logger(level, additional_locations=None, devmode=False): """ - Called by test configuration via `pytest`'s `conftest`. - All arguments are optional and have suitable defaults. + Establish a logger for a looper CLI program. + + This configures a logger to provide information about a looper program's + execution. Verbosity, destination(s) for messages, and message text + format are controlled by the arguments' values. This is also used by + looper's test suite. :param int | str level: logging level :param tuple(str | FileIO[str]) additional_locations: supplementary diff --git a/looper/looper.py b/looper/looper.py index d676d17f..4d18d414 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -215,12 +215,12 @@ def run(prj, args, remaining_args, interface_manager): # Don't submit samples with duplicate names. if sample.sample_name in processed_samples: - skip_reasons.append("Duplicate sample name.") + skip_reasons.append("Duplicate sample name") # Check if sample should be run. if hasattr(sample, SAMPLE_EXECUTION_TOGGLE): if sample[SAMPLE_EXECUTION_TOGGLE] != "1": - skip_reasons.append("Column '{}' deselected.".format(SAMPLE_EXECUTION_TOGGLE)) + skip_reasons.append("Column '{}' deselected".format(SAMPLE_EXECUTION_TOGGLE)) # Check if single_or_paired value is recognized. 
if hasattr(sample, _read_type): @@ -228,7 +228,7 @@ def run(prj, args, remaining_args, interface_manager): sample.read_type = re.sub( '[_\\-]?end$', '', str(sample.read_type)).lower() if sample.read_type not in valid_read_types: - skip_reasons.append("{} must be in {}.".\ + skip_reasons.append("{} must be in {}".\ format(_read_type, valid_read_types)) # Get the base protocol-to-pipeline mappings @@ -237,9 +237,9 @@ def run(prj, args, remaining_args, interface_manager): pipelines = interface_manager.build_pipelines(protocol) if len(pipelines) == 0: skip_reasons.append( - "No pipeline found for protocol {}.".format(protocol)) + "No pipeline found for protocol {}".format(protocol)) else: - skip_reasons.append("Missing '{}' attribute.".format(_protocol)) + skip_reasons.append("Missing '{}' attribute".format(_protocol)) if skip_reasons: @@ -270,7 +270,7 @@ def run(prj, args, remaining_args, interface_manager): pipeline_interface, pipeline_name=pipeline_key) except AttributeError: # TODO: inform about WHICH missing attribute(s). - fail_message = "Pipeline required attribute(s) missing." + fail_message = "Pipeline required attribute(s) missing" _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) @@ -280,7 +280,7 @@ def run(prj, args, remaining_args, interface_manager): sample.confirm_required_inputs() except IOError: # TODO: inform about WHICH missing file(s). - fail_message = "Required input file(s) not found." + fail_message = "Required input file(s) not found" _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) @@ -308,7 +308,7 @@ def run(prj, args, remaining_args, interface_manager): except AttributeError: # TODO: inform about which missing attribute(s). fail_message = "Required attribute(s) missing " \ - "for pipeline arguments string." + "for pipeline arguments string" _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) @@ -384,13 +384,13 @@ def run(prj, args, remaining_args, interface_manager): if failures: _LOGGER.info("%d sample(s) with submission failure.", len(failures)) - sample_count_pairs_by_reason = aggregate_exec_skip_reasons(failures) + sample_by_reason = aggregate_exec_skip_reasons(failures) _LOGGER.info("{} unique reasons for submission failure: {}".format( - len(sample_count_pairs_by_reason), - sample_count_pairs_by_reason.keys())) + len(sample_by_reason), + ", ".join(sample_by_reason.keys()))) _LOGGER.info("Per-sample submission failure count for each reason:") - for reason, sample_nfail_pairs in sample_count_pairs_by_reason.items(): - _LOGGER.info("> {}: {}".format(reason, sample_nfail_pairs)) + for reason, samples in sample_by_reason.items(): + _LOGGER.info("{}: {}".format(reason, samples)) @@ -792,6 +792,8 @@ def main(): args.config_file, args.subproject, file_checks=args.file_checks, compute_env_file=getattr(args, 'env', None)) + prj.add_sample_sheet() + prj.finalize_pipelines_directory() _LOGGER.info("Results subdir: " + prj.metadata.results_subdir) @@ -810,9 +812,14 @@ def main(): if len(pipedirs) == 0: _LOGGER.error("Looper requires a metadata.pipelines_dir") - raise AttributeError + raise AttributeError("Project metadata has an empty " + "collection of pipeline locations.") interface_manager = InterfaceManager(prj.metadata.pipelines_dir) + if not interface_manager.ifproto_by_proto_name: + _LOGGER.error("Empty interface manager. 
Does your project point " + "at least one pipelines location that exists?") + return try: run(prj, args, remaining_args, interface_manager=interface_manager) except IOError: diff --git a/looper/models.py b/looper/models.py index 7eefa8a5..3f65ac91 100644 --- a/looper/models.py +++ b/looper/models.py @@ -440,9 +440,6 @@ def __init__(self, config_file, subproject=None, # and adds default derived columns. self.sheet = None self.samples = list() - self.add_sample_sheet() - - self.finalize_pipelines_directory() @property @@ -874,7 +871,7 @@ def create_argtext(name): return pipeline_argtext - def add_sample_sheet(self, csv=None): + def add_sample_sheet(self, csv=None, sample_builder=None): """ Build a `SampleSheet` object from a csv file and add it and its samples to the project. @@ -894,7 +891,7 @@ def add_sample_sheet(self, csv=None): # Generate sample objects from annotation sheet. _LOGGER.debug("Creating samples from annotation sheet") - self.sheet.make_samples() + self.sheet.make_samples(sample_builder) # Add samples to Project for sample in self.sheet.samples: @@ -1089,17 +1086,17 @@ def alpha_cased(text, lower=False): return text.lower() if lower else text.upper() - def make_samples(self): + def make_samples(self, sample_builder=None): """ Create samples (considering library) from annotation sheet, and add them to the project. """ - create_sample = self.find_sample_subtypes() + create_sample = sample_builder or self._find_sample_subtypes() for _, row in self.df.iterrows(): self.samples.append(create_sample(row.dropna())) - def find_sample_subtypes(self): + def _find_sample_subtypes(self): """ Determine how to create Sample instances. @@ -1163,6 +1160,46 @@ def make_sample(data): return make_sample + + def protocol_to_subclass(self): + try: + import pipelines # Use a pipelines package if installed. + except ImportError: + # pipelines_dir is optional. + pipeline_dirpaths = getattr( + self.prj.metadata, "pipelines_dir", None) + + if not pipeline_dirpaths: + _LOGGER.debug("No pipelines directories to add to import path") + return None + + if isinstance(pipeline_dirpaths, str): + pipeline_dirpaths = [pipeline_dirpaths] + sys.path.extend(pipeline_dirpaths) + _LOGGER.debug( + "Added {} pipelines path(s) to sys.path: {}". + format(len(pipeline_dirpaths), pipeline_dirpaths)) + + try: + import pipelines + except ImportError: + _LOGGER.debug("Could not import pipelines") + return None + + _LOGGER.debug("Successfully imported pipelines") + + # Get all pipelines package Sample subclasses. + import inspect + from utils import fetch_package_classes + sample_types = fetch_package_classes(pipelines, + lambda maybe_class: inspect.isclass(maybe_class) + and issubclass(maybe_class, Sample)) + + # TODO: perhaps modify or alter handling of need for __library__. + return {self.alpha_cased(sample_class.__library__): sample_class + for sample_type, sample_class in sample_types} + + def as_data_frame(self): """ Returns a `pandas.DataFrame` representation of self. @@ -2058,7 +2095,8 @@ class InterfaceManager(object): def __init__(self, pipeline_dirs): # Collect interface/mappings pairs by protocol name. 
interfaces_and_protocols = \ - [ProtocolInterfaces(pipedir) for pipedir in pipeline_dirs] + [ProtocolInterfaces(pipedir) for pipedir in pipeline_dirs + if _os.path.exists(pipedir)] self.ifproto_by_proto_name = defaultdict(list) for ifproto in interfaces_and_protocols: for proto_name in ifproto.protomap: @@ -2187,6 +2225,9 @@ def __init__(self, pipedir): except Exception as e: _LOGGER.error(str(iface)) raise e + else: + raise ValueError("Alleged pipelines location '{}' exists neither " + "as a file nor as a folder.".format(pipedir)) def pipeline_key_to_path(self, pipeline_key): From 500b8a92af37ac6cccff99e5862454675f5b0f5f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 14 Jun 2017 18:45:30 -0400 Subject: [PATCH 05/94] better message --- looper/looper.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 4d18d414..ad613216 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -387,10 +387,9 @@ def run(prj, args, remaining_args, interface_manager): sample_by_reason = aggregate_exec_skip_reasons(failures) _LOGGER.info("{} unique reasons for submission failure: {}".format( len(sample_by_reason), - ", ".join(sample_by_reason.keys()))) - _LOGGER.info("Per-sample submission failure count for each reason:") - for reason, samples in sample_by_reason.items(): - _LOGGER.info("{}: {}".format(reason, samples)) + list(sample_by_reason.keys()))) + _LOGGER.info("Per-sample submission failure count for " + "each reason: {}".format(sample_by_reason)) From 0f77aee0ee32fa9acd71f4432be28683956af11f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 14 Jun 2017 19:20:12 -0400 Subject: [PATCH 06/94] simplify edge case handling in looper run; pause Sample subclass tests --- looper/looper.py | 21 ++++++--------- tests/models/independent/test_Sample.py | 34 ++++++++++++------------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index ad613216..f8ed7d8a 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -403,7 +403,7 @@ def aggregate_exec_skip_reasons(skip_reasons_sample_pairs): :return Mapping[str, Iterable[str]]: mapping from explanation to collection of names of samples to which it pertains """ - from collections import Counter, defaultdict + from collections import defaultdict samples_by_skip_reason = defaultdict(list) for skip_reasons, sample in skip_reasons_sample_pairs: for reason in set(skip_reasons): @@ -802,22 +802,17 @@ def main(): # TODO split here, spawning separate run process for each # pipelines directory in project metadata pipelines directory. - try: - pipedirs = prj.metadata.pipelines_dir - _LOGGER.info("Pipelines path(s): {}".format(pipedirs)) - except AttributeError: - _LOGGER.error("Looper requires a metadata.pipelines_dir") - raise - if len(pipedirs) == 0: - _LOGGER.error("Looper requires a metadata.pipelines_dir") - raise AttributeError("Project metadata has an empty " - "collection of pipeline locations.") + if not hasattr(prj.metadata, "pipelines_dir") or \ + len(prj.metadata.pipelines_dir) == 0: + raise AttributeError( + "Looper requires at least one pipeline(s) location.") interface_manager = InterfaceManager(prj.metadata.pipelines_dir) if not interface_manager.ifproto_by_proto_name: - _LOGGER.error("Empty interface manager. Does your project point " - "at least one pipelines location that exists?") + _LOGGER.error( + "The interface manager is empty. 
Does your project point " + "to at least one pipelines location that exists?") return try: run(prj, args, remaining_args, interface_manager=interface_manager) diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py index aa456bde..25602a88 100644 --- a/tests/models/independent/test_Sample.py +++ b/tests/models/independent/test_Sample.py @@ -17,10 +17,14 @@ def pytest_generate_tests(metafunc): """ Customization of this module's test cases. """ - if metafunc.cls == CustomSampleTests and \ - "subclass_attrname" in metafunc.fixturenames: - metafunc.parametrize(argnames="subclass_attrname", - argvalues=["library", "protocol"]) + if metafunc.cls == CustomSampleTests: + if "subclass_attrname" in metafunc.fixturenames: + metafunc.parametrize(argnames="subclass_attrname", + argvalues=["library", "protocol"]) + if "pipelines_type" in metafunc.fixturenames: + metafunc.parametrize(argnames="pipelines_type", + argvalues=["module", "package"]) + @@ -168,14 +172,8 @@ def test_requires_sample_name(self, has_name, data_type): Sample(data_type(data)) -from looper.models import Sample -class DummySampleSubclass(Sample): - """ Subclass shell to test ability of Project to find Sample subclass. """ - __library__ = "arbitrary" - pass - - +@pytest.mark.skip("Not implemented") class CustomSampleTests: """ Bespoke Sample creation tests. """ @@ -198,7 +196,8 @@ def sample_subclass_definition(self, tmpdir, request): with open(init_file, 'w') as f: pass module_file = tempfile.NamedTemporaryFile(dir=pipe_path, suffix=".py", delete=False) - with open(module_file, 'w') as modfile: + module_file.close() + with open(module_file.name, 'w') as modfile: # TODO: write out definition. pass else: @@ -236,19 +235,20 @@ def test_generic_sample_for_unfindable_subclass(self): pass - @pytest.mark.parametrize( - argnames="pipelines_type", argvalues=["module", "package"]) - def test_raw_pipelines_import_has_sample_subclass(self, subclass_attrname): + def test_raw_pipelines_import_has_sample_subclass( + self, pipelines_type, subclass_attrname): """ Project finds Sample subclass in pipelines package. """ pass - def test_project_pipelines_dir_has_sample_subclass(self, subclass_attrname): + def test_project_pipelines_dir_has_sample_subclass( + self, pipelines_type, subclass_attrname): """ Project finds Sample subclass in optional pipelines_dir. """ pass - def test_sample_subclass_messaging(self, subclass_attrname): + def test_sample_subclass_messaging( + self, pipelines_type, subclass_attrname): """ Sample subclass seek process provides info about procedure. """ pass From 322fe8d7c93971e4460892d6b0c61d525aa6d287 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 15 Jun 2017 03:07:23 -0400 Subject: [PATCH 07/94] reflect separated project init in test fixture; update requiremnets; require setuptools --- requirements/requirements-all.txt | 2 +- setup.cfg | 6 +++--- setup.py | 26 ++++++++++++++++---------- tests/conftest.py | 5 ++++- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index aa459e2c..cc4117e1 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,3 +1,3 @@ colorama==0.3.9 -pandas==0.20.1 +pandas==0.20.2 pyyaml==3.12 diff --git a/setup.cfg b/setup.cfg index e13f2591..5d8fdac6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ +[aliases] +test = pytest + [pytest] # Only request extra info from failures and errors. 
addopts = -rfE -[aliases] -test = pytest - diff --git a/setup.py b/setup.py index c54d33a9..f8ec0933 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,14 @@ #! /usr/bin/env python import os +from setuptools import setup import sys # Additional keyword arguments for setup(). extra = {} +# Ordinary dependencies DEPENDENCIES = [] with open("requirements/requirements-all.txt", "r") as reqs_file: for line in reqs_file: @@ -14,14 +16,20 @@ continue DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) +# numexpr for pandas try: - from setuptools import setup - if sys.version_info >= (3,): - extra["use_2to3"] = True - extra["install_requires"] = DEPENDENCIES + import numexpr except ImportError: - from distutils.core import setup - extra["requires"] = DEPENDENCIES + # No numexpr is OK for pandas. + pass +else: + # pandas 0.20.2 needs updated numexpr; the claim is 2.4.6, but that failed. + DEPENDENCIES.append("numexpr==2.6.2") + +# 2to3 +if sys.version_info >= (3, ): + extra["use_2to3"] = True +extra["install_requires"] = DEPENDENCIES # Additional files to include with package @@ -64,9 +72,7 @@ def get_static(name, condition=None): package_data={'looper': ['submit_templates/*']}, include_package_data=True, test_suite="tests", - tests_require=["mock", "pytest"], - setup_requires=(["pytest-runner"] - if {"ptr", "test", "pytest"} & set(sys.argv) - else []), + tests_require=(["mock", "pytest"]), + setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), **extra ) diff --git a/tests/conftest.py b/tests/conftest.py index 335093b3..01135826 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -431,7 +431,10 @@ def proj(request): :return looper.models.Project: object created by parsing data in file pointed to by `request` class """ - return _create(request, Project) + p = _create(request, Project) + p.add_sample_sheet() + p.finalize_pipelines_directory() + return p From cb3a8381b363a44533d48b343a96e1740390ede1 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 15 Jun 2017 03:34:07 -0400 Subject: [PATCH 08/94] type cast for config value for inequality comparison --- looper/looper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index f8ed7d8a..f83695d4 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -350,10 +350,10 @@ def run(prj, args, remaining_args, interface_manager): cmd += " -C " + pl_config_file cmd += " -O " + prj.metadata.results_subdir - if submit_settings.setdefault("cores", 1) > 1: + if int(submit_settings.setdefault("cores", 1)) > 1: cmd += " -P " + submit_settings["cores"] try: - if submit_settings["mem"] > 1: + if float(submit_settings["mem"]) > 1: cmd += " -M " + submit_settings["mem"] except KeyError: _LOGGER.warn("Submission settings " From a195362c79d142b56d4fc0c56eea3160f5c3958b Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 15 Jun 2017 03:45:12 -0400 Subject: [PATCH 09/94] reflect new requirements in changelog --- doc/source/changelog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index 4ec190a3..04a2f053 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -29,6 +29,8 @@ Changelog - Various small bug fixes and dev improvements. + - Require `setuptools` for installation, and `pandas 0.20.2`. If `numexpr` is installed, + version `2.6.2` is required. 
- **v0.5** (*2017-03-01*): From 5a7eaca9a519802642194e937228c893a33c5b5e Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 15 Jun 2017 10:32:44 -0400 Subject: [PATCH 10/94] version bump to rc2 --- looper/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/looper/_version.py b/looper/_version.py index 29c83aa3..76d80dda 100644 --- a/looper/_version.py +++ b/looper/_version.py @@ -1 +1 @@ -__version__ = "0.6.0-rc1" +__version__ = "0.6.0-rc2" From 1f26297bc1503d0f86e0440eef2c74bb13ab731c Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 15 Jun 2017 10:36:06 -0400 Subject: [PATCH 11/94] Remove scripts directory This directory contains old relics of previous ways of doing things. These scripts have moved to the internal scripts repo in case they're needed later --- scripts/cleanFailed.sh | 10 - scripts/convertBismarkReport.R | 84 ---- scripts/fastqcSummary.py | 156 ------- scripts/flagCheck.sh | 23 - scripts/make_SummaryTable.py | 319 ------------- scripts/make_trackhubs.py | 543 ----------------------- scripts/normalize_wig.R | 67 --- scripts/normalize_wig_submit.sh | 13 - scripts/summarizePipelineStats.R | 136 ------ scripts/summarizePipelineStats_complex.R | 131 ------ 10 files changed, 1482 deletions(-) delete mode 100755 scripts/cleanFailed.sh delete mode 100755 scripts/convertBismarkReport.R delete mode 100755 scripts/fastqcSummary.py delete mode 100755 scripts/flagCheck.sh delete mode 100755 scripts/make_SummaryTable.py delete mode 100644 scripts/make_trackhubs.py delete mode 100755 scripts/normalize_wig.R delete mode 100755 scripts/normalize_wig_submit.sh delete mode 100755 scripts/summarizePipelineStats.R delete mode 100755 scripts/summarizePipelineStats_complex.R diff --git a/scripts/cleanFailed.sh b/scripts/cleanFailed.sh deleted file mode 100755 index 49842138..00000000 --- a/scripts/cleanFailed.sh +++ /dev/null @@ -1,10 +0,0 @@ -# Deletes all directories with a failed flag -ls -d */*failed* - -read -p "Are you sure? " -n 1 -r -echo # (optional) move to a new line -if [[ $REPLY =~ ^[Yy]$ ]] -then - ls -d */*failed* | cut -d'/' -f1 | xargs rm -rfv -fi - diff --git a/scripts/convertBismarkReport.R b/scripts/convertBismarkReport.R deleted file mode 100755 index e2bc8da6..00000000 --- a/scripts/convertBismarkReport.R +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env Rscript -options(echo=FALSE) -library("data.table") -suppressPackageStartupMessages(library("optparse")) -#d <- fread("/data/groups/lab_bock/fhalbrit/projects/hema_precursors//results_pipeline//results_pipeline/MPP_10_D1_1_R1//bismark_hg38/extractor/MPP_10_D1_1_R1.aln.dedup.filt.CpG_report_filt.txt") - - -optionList <- list( - make_option( c("-i", "--input"), type="character", help="Input file. A Bismark CpG report (CHR START STRAND HITCOUNT MISSCOUNT DINUCLEOTIDE CONTEXT)"), - make_option( c("-f", "--formats"), type="character", default="cov,min", help="A comma-separated list of output formats. Supported formats are: cov (Bismark coverage file: CHR START END METHPERCENT HITCOUNT MISSCOUNT), min (minimal coverage file: CHR START HITS TOTAL). Default: cov,min"), - make_option( c("-c", "--noCovFilter"), default=FALSE,type="logical", action="store_true", help="Disable coverage filter. If not set, CpG's without any coverage will be removed"), - make_option( c("-s", "--noChromFilter"), default=FALSE, type="logical", action="store_true", help="Disable chromosome filter. 
If not set, non-standard chromosomes (everything with an underscore in the name) will be removed"), - make_option( c("-a", "--noAdjustMinusStrand"), default=FALSE, type="logical", action="store_true", help="Disable reverse strand adjustment. If not set, the coordiantes of all sites on the reverse strand (-) will be adjusted by subtracting 1") -) -opts <- parse_args(OptionParser(option_list=optionList)) - - -if (is.null(opts$input)) { - print_help(OptionParser(option_list=optionList)) - stop("No input file provided") -} else { - cpgReport <- opts$input - filterUncovered <- !opts$noCovFilter - removeNonStandardChroms <- !opts$noChromFilter - adjustMinusStrand <- !opts$noAdjustMinusStrand - outputFormats <- strsplit(tolower(opts$formats),",")[[1]] - - message("+ Starting to convert Bismark CpG report file: ", cpgReport) - - # read in data: - message("\tReading and modifying data...") - d <- fread(cpgReport) - setnames(d, paste0("V", 1:7), c("chr", "start", "strand", "hitCount", "missCount", "dinucleotide", "context")) - - # calculate total read count: - d[, readCount:=hitCount+missCount] - - # remove unnecessary columns: - d[, c("dinucleotide", "context", "missCount"):=NULL] - - # remove uncovered regions: - if(filterUncovered) { - message("\tRemove uncovered CpG's...") - d <- d[ readCount>0,] - } - - # adjust the coordinate of C's on the (-)-strand: - if(adjustMinusStrand) { - message("\tAdjusting reverse strand coordinates...") - d[strand=="-",start := as.integer(start-1)] - } - d[, strand:=NULL] - - # aggregate all regions with identical coordinates: - message("\tAggregating regions by coordinate...") - d <- d[,list(hitCount= sum(hitCount), readCount=sum(readCount)), by=list(chr, start)] - setcolorder(d,c("chr", "start", "hitCount", "readCount")); - - # remove non-standard chromosomes (_random, unintegrated contiqs, etc.) - if(removeNonStandardChroms) { - message("\tFiltering chromosomes...") - d <- d[ !grep("_",chr),]; - } - - # write output file(s): - for(outputFormat in outputFormats) { - outName <- paste0(gsub(".txt$", "", cpgReport, perl=TRUE, ignore.case=TRUE), ".", outputFormat) - if(outputFormat == "cov") { - message("\tWriting Bismark coverage format (CHR START END METHPERCENT HITCOUNT MISSCOUNT): ", outName) - d[, methPerc:= hitCount/readCount*100] - d[, missCount:= readCount-hitCount] - write.table(d[,list(chr,start,start,methPerc,hitCount,missCount)], file=outName, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) - } - else if(outputFormat == "min") { - message("\tWriting minimal coverage output format (CHR START HITS TOTAL): ", outName) - write.table(d[,list(chr,start,hitCount,readCount)], file=outName, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) - } - else { - warning("\tUnrecognized output format: ", outputFormat) - } - } - - message("+ Finished conversion: ", cpgReport) -} diff --git a/scripts/fastqcSummary.py b/scripts/fastqcSummary.py deleted file mode 100755 index b5649c2d..00000000 --- a/scripts/fastqcSummary.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python - -import os, inspect, ConfigParser, subprocess, sys, errno, glob, zipfile, csv -from argparse import ArgumentParser -#import csv -#import -#import glob -#import -#import re - -# constants: -nThreadsPerCpu = 4 -nMemPerThread = 1024 -nCpusSlurm = 8 -defaultRawDataPath = "/fhgfs/groups/lab_bsf/samples/" - -# parse user-supplied arguments: -parser = ArgumentParser(description='FASTQC') -parser.add_argument('-c', '--config-file', dest='confFile', help="Supply config file with [-c]. 
The path of the sample annotation sheet will be parsed from this. Example: /fhgfs/groups/lab_bock/shared/COREseq/config.txt") -parser.add_argument('-a', '--annot-file', dest='annotFile', help="Specify a sample annotation sheet directly") -parser.add_argument('-o', '--output-dir', dest='outputDir', help="Directory to write results to") -parser.add_argument('-f', '--fastqc', dest='fastqcPath', help="Full path of FASTQC exectuable", default="/cm/shared/apps/FastQC/0.11.2/fastqc") -parser.add_argument('-s', '--slurm', dest='useSlurm', action='store_true', help="Execute script on SLURM cluster.", default=False) -parser.add_argument('-q', '--quick-summary', dest='quickSummary', action='store_true', help="Skip FastQC, just write the summary report", default=False) -parser.add_argument('-p', '--parallel', dest='nCpus', help="Number of CPUs to use (going to start 4 threads per CPU)", default=1) -parser.add_argument('-d', '--raw-path', dest='rawPath', help="Raw data path") -args, remaining_args = parser.parse_known_args() - -# get input directory either directly as command line argument (highest priority) or from a config file: -annotFile = None -outputDir = None -rawDataPath = defaultRawDataPath -if args.annotFile: - annotFile = args.annotFile -elif args.confFile: - #get configurations - config = ConfigParser.ConfigParser({"results": None, "raw_data_path": defaultRawDataPath}) - config.readfp(open(os.path.abspath(args.confFile))) - annotFile = config.get("paths","psa") - if annotFile is None: - print "The config file provided does not define an annotation sheet (parameter name: 'psa')" - raise SystemExit - outputDir = config.get("paths","project_root") - if outputDir is not None: - outputDir = outputDir + "/fastqc" - rawDataPath = config.get("paths","raw_data_path") -else: - print "Supply either a config file (--config-file=X) or the full path of the annotation sheet (--annot-file=X)" - raise SystemExit - -# define relevant paths: -scriptPath = os.path.abspath(inspect.getfile(inspect.currentframe())) -fastqcPath = os.path.abspath(args.fastqcPath) -annotFile = os.path.abspath(annotFile) -if args.outputDir: - outputDir = args.outputDir -if outputDir is None: - print "No output directory specified (--output-dir=X)" - raise SystemExit -outputDir = os.path.abspath(args.outputDir) -if args.rawPath: - rawDataPath = args.rawPath -nCpus = args.nCpus - -# print some basic information: -print "FASTQC Summary" -print "----" -print "Full script path:\t" + scriptPath -print "Full FASTQC path:\t" + fastqcPath -print "Raw data root directory:\t" + rawDataPath -print "Sample sheet:\t" + annotFile -print "Output root directory:\t:" + outputDir -print "#CPUs:\t:" + str(nCpus) -print "#treads/CPU:\t:" + str(nThreadsPerCpu) -print "#mem/thread:\t:" + str(nMemPerThread) -print "----" - -# create results directory if it doesn't exist yet: -try: - os.makedirs(outputDir) -except OSError as exception: - if exception.errno != errno.EEXIST: - raise - -### MAIN JOB EXECUTION ### - -# if desired, submit the job for execution on the cluster: -if args.useSlurm: - slurmScript = outputDir + "/fastqc_slurm.sub" - slurmLog = outputDir + "/fastqc_slurm.log" - - with open(slurmScript, "w") as fout: - fout.write("#!/bin/bash\n") - fout.write("#SBATCH --job-name=fastqc\n") - fout.write("#SBATCH --mem-per-cpu=" + str(nThreadsPerCpu * nMemPerThread) + "\n") - fout.write("#SBATCH --cpus-per-task=" + str(nCpus) + "\n") - fout.write("#SBATCH -m block\n") - fout.write("#SBATCH --partition=mediumq\n") - fout.write("#SBATCH --time=24:00:00\n") - 
fout.write("#SBATCH --output " + slurmLog + "\n") - fout.write("echo 'Compute node:' `hostname`\n") - fout.write("echo 'Start time:' `date +'%Y-%m-%d %T'`\n") - fout.write("python " + scriptPath + " --raw-path=" + rawDataPath + " --annot-file=" + annotFile + " --parallel=" + str(nCpusSlurm) + " --output-dir=" + outputDir + "\n") - fout.write("echo 'End time:' `date +'%Y-%m-%d %T'`\n") - - subprocess.check_call(["sbatch", slurmScript]) - -# otherwise, just execute the command directly on the current machine: -# (this is what the SLURM-based execution mode will do once the job has been allocated to a specific node) -else: - # execute FastQC on all BAM files: - if not args.quickSummary: - subprocess.check_call([fastqcPath, "--version"]) - - bamFolders = {} - - with open(annotFile, "rb") as annotF: - annotDict = csv.DictReader(annotF) - for row in annotDict: - bamDir = rawDataPath + row["flowcell"] + "/" + row["flowcell"] + "_" + row["lane"] + "_samples/" - bamFile = bamDir + row["flowcell"] + "_" + row["lane"] + "#" + row["BSF_name"] + ".bam" - - if os.path.isfile(bamFile): - bamFolders[bamDir] = True - - for bamFolder in bamFolders: - subprocess.check_call(fastqcPath + " " + bamFolder+"/*.bam --threads="+str(int(nCpus) * nThreadsPerCpu) + " --noextract --outdir="+outputDir, shell=True) # N.B. can't use the proper syntax with an array for args, because FastQC cannot handle the quoted string ('...') as an input path name - - allKeys = {} - resultsMap = {} - zipSuffix = ".zip" - sep = "\t" - summaryFile = outputDir + "/summary.tsv" - - # collate summaries in one overview file: - print "Collecting summary statistics into: " + summaryFile - with open(summaryFile, "w") as fout: - for fastqcZip in glob.glob(outputDir + "/*"+zipSuffix): - curName = fastqcZip[len(outputDir+"/"):-len(zipSuffix)] - curMap = {} - #print curName - with zipfile.ZipFile(fastqcZip) as z: - with z.open(curName+"/summary.txt") as f: - for line in f: - tokens = line.split(sep) - curMap[tokens[1]] = tokens[0] - allKeys[tokens[1]] = True - resultsMap[curName] = curMap - - fout.write("Dataset" + sep + sep.join(allKeys.keys())+"\n") - for sample, curMap in resultsMap.items(): - fout.write(sample) - for k in allKeys: - fout.write(sep + curMap[k]) - fout.write("\n") - diff --git a/scripts/flagCheck.sh b/scripts/flagCheck.sh deleted file mode 100755 index 2468689b..00000000 --- a/scripts/flagCheck.sh +++ /dev/null @@ -1,23 +0,0 @@ -completed=`ls */*completed.flag 2> /dev/null | wc -l` -running=`ls */*running.flag 2> /dev/null | wc -l` -failed=`ls */*failed.flag 2> /dev/null | wc -l` -echo "completed: $completed" -echo "running: $running" -echo "failed: $failed" -ls */*.flag | xargs -n1 basename | sort | uniq -c - -if [ $failed -lt 30 ]; then -echo "List of failed flags:" -ls */*failed.flag 2> /dev/null -fi - -if [ $completed -lt 30 ]; then -echo "List of completed flags:" -ls */*completed.flag 2> /dev/null -fi - -if [ $running -lt 30 ]; then -echo "List of running flags:" -ls */*running.flag 2> /dev/null -fi - diff --git a/scripts/make_SummaryTable.py b/scripts/make_SummaryTable.py deleted file mode 100755 index 6ca15a6e..00000000 --- a/scripts/make_SummaryTable.py +++ /dev/null @@ -1,319 +0,0 @@ -#! 
/usr/bin/env python - -# This script loops through all the samples, -# creates a summary stats table -import csv -import os -from argparse import ArgumentParser -from pypiper import AttributeDict -import yaml - - -# Argument Parsing -# ####################################################################################### -parser = ArgumentParser(description='make_SummaryTable') -parser.add_argument('-c', '--config-file', dest='config_file', help="path to YAML config file", required=True, type=str) -parser.add_argument('--excel', dest='excel', action='store_true', help="generate extra XLS and XLSX sheet", default=False, required=False) -# Charles : On time the legacy/rigid mode will be removed -parser.add_argument('--rigid', dest='rigid', action='store_true', help="the legacy rigid mode that only takes in the hard-coded values", default=False, required=False) -args = parser.parse_args() - -with open(args.config_file, 'r') as config_file: - config_yaml = yaml.load(config_file) - config = AttributeDict(config_yaml, default=True) -paths = config.paths - - - -if not os.path.exists(paths.output_dir): - raise Exception(paths.output_dir + " : project directory does not exist!") - - -# FOR RIGID -# ####################################################################################### -fields_in = [] -fields_out = [] -if args.rigid: - # the hard-coded fields for the legacy/rigid mode - fields_in = ['sample_name','instrument_model','flowcell','lane','read_length','Read_type','organism','Genome'\ - ,'cell_type','Raw_reads','Trimmed_reads','Trimmed_rate','Aligned_reads','Aligned_rate'\ - ,'Multimap_reads','Multimap_rate','Unique_CpGs','Total_CpGs','meanCoverage',\ - 'bisulfiteConversionRate','globalMethylationMean',\ - 'K1_unmethylated_count','K1_unmethylated_meth','K3_methylated_count','K3_methylated_meth'] - fields_out = ['Sample','Instrument','Flowcell','Lane','Read Length','Read Type','Organism','Genome'\ - ,'Cell Type','Raw Reads','Trimmed Reads','Trimmed Rate','Aligned Reads','Aligned Rate'\ - ,'Multimap Reads','Multimap Rate','Unique CpGs','Total CpGs','Mean Coverage',\ - 'Bisulfite Conversion Rate',' Global Methylation Mean',\ - 'K1 Unmethylated Count','K1 Unmethylated Meth','K3 Methylated Count','K3 Methylated Meth'] - - -# Open samples CSV file -# ####################################################################################### -csv_file_path = os.path.join(os.path.dirname(args.config_file),config.metadata.sample_annotation) -print("\nOpening CSV file: " + csv_file_path) -if os.path.isfile(csv_file_path): - csv_file = open(os.path.join(os.path.dirname(args.config_file),config.metadata.sample_annotation), 'rb') - print("Found " + csv_file_path) -else: - raise Exception(csv_file_path + " : that file does not exist!") -csv_reader = csv.DictReader(csv_file) - - -# Looping over all samples -# ####################################################################################### -global_list = dict() -global_keys = dict() - -pipelines = [] -sample_count = 0 -column_count = 0 -print("\nStart iterating over samples") - -for row in csv_reader: - - sample_count += 1 - sample_name = row['sample_name'] - print("\n##### Processing sample #"+ str(sample_count) + " : " + sample_name + " #####") - - # wrap this all in a try block, so it can skip a few bad samples - # without breaking the whole thing - try: - - # Open sample TSV stat file - stats_file_dir = os.path.join(paths.output_dir,paths.results_subdir,sample_name) - stats_file_path = 
os.path.join(paths.output_dir,paths.results_subdir,sample_name,row['library']+'_stats.tsv') - if not os.path.isfile(stats_file_path): - for thefile in os.listdir(stats_file_dir): - if thefile.endswith("stats.tsv"): stats_file_path = os.path.join(stats_file_dir,thefile) - if os.path.isfile(stats_file_path): - stats_file = open(stats_file_path, 'rb') - print("Found: " + stats_file_path) - else: - raise Exception(stat_file_path + " : file does not exist!") - - - stats_dict = dict() - stats_dict_keys = dict() - - # Check if file has third column -> define pipelines based on that - # plus read info from file - - for line in stat_file: - - line_content = line.split('\t') - key = line_content[0] - value = line_content[1] - pip = "x" - if len(line_content) == 3: - pip = line_content[2].strip() - pipelines.append(pip) - if not pip in stats_dict: stats_dict[pip] = dict() - if not pip in stats_dict_keys: stats_dict_keys[pip] = [] - stats_dict[pip][key] = value.strip() - stats_dict_keys[pip].append(key) - - pipelines = list(set(pipelines)) - print "Pipelines: " + str(pipelines) - - - # stats_dict and stats_dict_keys are pipeline specific - for pip in pipelines: - if not pip in global_list: global_list[pip] = [] - if not pip in global_keys: global_keys[pip] = [] - - - - # if there are two pipelines make sure that certain values are present in both - missing_cols = ["Raw_reads", "Fastq_reads", "Trimmed_reads", "Trim_loss_rate"] - if len(pipelines) == 2: - for col in missing_cols: - if not col in stats_dict[pipelines[1]] and col in stats_dict[pipelines[0]]: stats_dict[pipelines[1]][col] = stats_dict[pipelines[0]][col] - if not col in stats_dict[pipelines[0]] and col in stats_dict[pipelines[1]]: stats_dict[pipelines[0]][col] = stats_dict[pipelines[1]][col] - for pip in pipelines: - stats_dict_keys[pip] = list(set(stats_dict_keys[pip] + missing_cols)) - - # Write to global list and keys - new_row = dict() - column_count = 0 - for pip in pipelines: - new_row = row.copy() - new_row.update(stats_dict[pip]) - global_list[pip].append(new_row) - global_keys[pip] = csv_reader.fieldnames + stats_dict_keys[pip] - - - except Exception as e: - - print("Sample " + sample_name + " failed. 
Error: " + str(e)) - -csv_file.close() -# print global_keys -# print global_list - -# Writing to Output Files -# ####################################################################################### -if not args.rigid: - - # Writing TSV file - # ####################################################################################### - - for pip in pipelines: - - pip_nam = "_" + pip - if pip_nam == "_x": pip_nam = "" - tsv_outfile_path = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+ pip_nam + '_stats_summary.tsv') - tsv_outfile = open(tsv_outfile_path, 'w') - - if global_list[pip] and global_keys[pip]: - - tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=global_keys[pip], delimiter='\t') - tsv_writer.writeheader() - - for i,sample in enumerate(global_list[pip]): - tsv_writer.writerow(sample) - if args.excel: - for j,field in enumerate(global_keys[pip]): - if i == 0: xls_sheet.write(0, j, field) - xls_sheet.write(i+1, j, sample[field]) - - tsv_outfile.close() - - print("\nInput used: " + csv_file_path) - print("Results TSV file: " + tsv_outfile_path) - - - - - # Output XLS file - # ####################################################################################### - if args.excel: - - raise Exception("--excel not implemented") - - import xlwt - - for pip in pipelines: - - pip_nam = "_" + pip - if pip_nam == "_x": pip_nam = "" - - xls_book = xlwt.Workbook(encoding="utf-8") - xls_sheet_name = "Stats" + pip_nam - xls_sheet = xls_book.add_sheet(xls_sheet_name) - - # Where should this be written? Here or below? - # if args.rigid: - # for i,field in enumerate(fields_out): - # xls_sheet.write(0, i, field) - - import xlrd - import openpyxl - - # saving the XLS sheet - xls_filename = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+'_stats_summary.xls') - xls_book.save(xls_filename) - print("Results XLS file: " + xls_filename) - - # convert XLS sheet to XLSX format - xlsx_book_in = xlrd.open_workbook(xls_filename) - index = 0 - nrows = sample_count + 2 - ncols = 0 - if global_keys[pip]: ncols = len(global_keys[pip]) - else: ncols = column_count - ncols += 1 - xlsx_sheet_in = xlsx_book_in.sheet_by_index(0) - xlsx_book_out = openpyxl.Workbook() - xlsx_sheet = xlsx_book_out.active - xlsx_sheet.title = xls_sheet_name - for row in range(1, nrows): - for col in range(1, ncols): - xlsx_sheet.cell(row=row, column=col).value = xlsx_sheet_in.cell_value(row-1, col-1) - xlsx_filename = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+'_stats_summary.xlsx') - xlsx_book_out.save(xlsx_filename) - print("Results XLSX file: " + xlsx_filename) - - print("\n") - - - -# RIGID -else: - if args.excel: - raise Exception("--excel not implemented for option --rigid") - - for pip in pipelines: - - pip_nam = "_" + pip - if pip_nam == "_x": pip_nam = "" - # Open file to write to - tsv_outfile_path = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+ pip_nam + '_stats_summary.tsv') - tsv_outfile = open(tsv_outfile_path, 'w') - tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=fields_out, delimiter='\t') - tsv_writer.writeheader() - - - - - # for each sample data (one element of the global list) - for sample_dict in global_list[pip]: - - new_row = dict() - # Write each field - for i in range(0,len(fields_in)): - - field = fields_in[i] - field_out = fields_out[i] - content = str('') - content_float = float(-1e10) - content_int = int(-1) - - # extract all the listed fields - # some data types might not have all the fields in stats_dict, then catch the 
KeyError - try: - if field == 'Trimmed_rate': - content_float = 100.0*float(sample_dict['Trimmed_reads'])/float(sample_dict['Raw_reads']) - elif field == 'Aligned_rate': - content_float = 100.0*float(sample_dict['Aligned_reads'])/float(sample_dict['Trimmed_reads']) - elif field == 'Multimap_rate': - content_float = 100.0*float(sample_dict['Multimap_reads'])/float(sample_dict['Trimmed_reads']) - elif field in sample_dict: - content = str(sample_dict[field].strip()) - else: - content = 'NA' - print("No field called: " + field) - except KeyError: - content = 'NA' - print("Data missing to calculate: " + field) - - # convert str to float or int if needed - got_comma = content.find('.') - try: - content_float = float(content) - except ValueError: - pass - if not got_comma: - content_int = int(content_float) - - # write the field for each row - if content_int > -1: - column_count += 1 - new_row[field_out] = content_int - if args.excel: xls_sheet.write(sample_count, i, content_int) - elif content_float > -1e10: - column_count += 1 - new_row[field_out] = content_float - if args.excel: xls_sheet.write(sample_count, i, content_float) - else: - column_count += 1 - new_row[field_out] = content - if args.excel: xls_sheet.write(sample_count, i, content) - - tsv_writer.writerow(new_row) - - tsv_outfile.close() - - - - - diff --git a/scripts/make_trackhubs.py b/scripts/make_trackhubs.py deleted file mode 100644 index 71f7cb0a..00000000 --- a/scripts/make_trackhubs.py +++ /dev/null @@ -1,543 +0,0 @@ -#! /usr/bin/env python -""" Create a trackhub for each sample. """ - -from argparse import ArgumentParser -import csv -import datetime -import getpass -import os -import subprocess -import yaml -from looper.looper import SAMPLE_EXECUTION_TOGGLE -from pypiper import AttributeDict - - -# Argument Parsing -# ####################################################################################### -parser = ArgumentParser(description='make_trackhubs') -parser.add_argument('-c', '--config-file', dest='config_file', help="path to YAML config file", required=True, type=str) -parser.add_argument('-f', dest='filter', action='store_false', required=False, default=True) -parser.add_argument('-v', '--visibility', dest='visibility', help='visibility mode (default: full)', required=False, default='full', type=str) -parser.add_argument('--copy', dest='copy', help='copy sepcified file types instead of creating symbolic links, example: --copy BAM-BB-BW-BED-TH', required=False, type=str) - -args = parser.parse_args() - -with open(args.config_file, 'r') as config_file: - config_yaml = yaml.load(config_file) - config = AttributeDict(config_yaml, default=True) - -trackhubs = config.trackhubs -paths = config.paths - -print(config) - -if not os.path.exists(paths.output_dir): - raise Exception(paths.output_dir + " : that project directory does not exist!") - -present_genomes = {} -subGroups_perGenome = {} -subGroups = { - "exp_category": {}, - "FACS_marker": {}, - "cell_type": {}, - "treatment": {}, - "treatment_length": {}, - "cell_count": {}, - "library": {}, - "data_type": {} -} -# add x- and y-dimension to subGroups even if they are not in the standard column selection: -subGroups[trackhubs.matrix_x] = {} -subGroups[trackhubs.matrix_y] = {} - - -csv_file_path = os.path.join(os.path.dirname(args.config_file), config.metadata.sample_annotation) -print "\nOpening CSV file: " + csv_file_path -if os.path.isfile(csv_file_path): - csv_file = open(os.path.join(os.path.dirname(args.config_file), config.metadata.sample_annotation), 'rb') # 
opens the csv file -else: - raise Exception(csv_file_path + " : that file does not exist!") - -try: - - csv_file_0 = open(os.path.join(os.path.dirname(args.config_file), config.metadata.sample_annotation), 'rb') - input_file_0 = csv.DictReader(csv_file_0) # creates the reader object - - pipeline = "" - genome = "" - for row in input_file_0: - if ("library" in row.keys()): - pipeline = str(row["library"]).upper() - if ("organism" in row.keys()): - genome = str(getattr(config.genomes, str(row["organism"]))) - print 'Pipeline: ' + pipeline - print 'Genome: ' + genome - print("Trackhub dir: " + trackhubs.trackhub_dir) - if pipeline != "": - pipeline += '_' - - paths.write_dir = "" - - if args.copy: - paths.write_dir = trackhubs.trackhub_dir - if not os.path.exists(paths.write_dir): - os.makedirs(paths.write_dir) - else: - paths.write_dir = paths.output_dir - if not os.path.islink(trackhubs.trackhub_dir): - os.symlink(os.path.relpath(paths.write_dir, os.path.dirname(trackhubs.trackhub_dir)), trackhubs.trackhub_dir) - print 'Linking to: ' + str(trackhubs.trackhub_dir) - else: - print 'Link already exists: ' + str(trackhubs.trackhub_dir) - print 'Writing files to: ' + paths.write_dir - - genomes_file = open(os.path.join(paths.write_dir, pipeline + 'genomes.txt'), 'w') - - track_out = os.path.join(paths.write_dir, genome) - if not os.path.exists(track_out): - os.makedirs(track_out) - print 'Writing tracks to: ' + track_out - else: - print 'Trackhubs already exists! Overwriting everything in ' + track_out - userID = os.getuid() - for root, dirs, files in os.walk(track_out, topdown=False): - for name in files: - ownerID = 0 - try: - ownerID = os.stat(os.path.join(root, name)).st_uid - except: - os.remove(os.path.join(root, name)) - if ownerID == userID: - try: - os.remove(os.path.join(root, name)) - except: - pass - for name in dirs: - ownerID = os.stat(os.path.join(root, name)).st_uid - if ownerID == userID: - try: - os.rmdir(os.path.join(root, name)) - except: - pass - - # write hub.txt - hub_file_name = pipeline + "hub.txt" - hub_file = open(os.path.join(paths.write_dir, hub_file_name), 'w') - hub_file.writelines("hub " + trackhubs.hub_name + "\n") - hub_file.writelines("shortLabel " + trackhubs.hub_name + "\n") - hub_file.writelines("longLabel " + trackhubs.hub_name + "\n") - hub_file.writelines("genomesFile " + pipeline + "genomes.txt\n") - hub_file.writelines("email " + trackhubs.email + "\n") - - # Write a HTML document. 
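# Descriptive note on the deleted block that follows (not part of the original file):
# it builds the HTML report piece by piece -- a header with the project title, a
# "Useful Links" section (stats summary tables plus a UCSC Genome Browser link),
# a per-sample data-files table, and a footer with contact information.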
- html_out = str() - html_out_tab1 = str() - html_out_tab2 = str() - clean_title = os.path.basename(paths.output_dir).replace('_',' ') - # Write HTML header and title - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n'.format(getpass.getuser()) - html_out += '\n'.format(datetime.datetime.now().isoformat()) - html_out += '\n'.format(clean_title) - html_out += '\n' - html_out += '{}\n'.format(clean_title) - html_out += '\n' - html_out += '\n' - - tableDict = dict() - - input_file = csv.DictReader(csv_file) - sample_count = 0 - - print '\nStart iterating over samples' - for row in input_file: # iterates the rows of the file in orders - - sample_count += 1 - - sample_name = row["sample_name"] - print '\nProcessing sample #' + str(sample_count) + " : " + sample_name - - tableDict[sample_name] = dict() - - if SAMPLE_EXECUTION_TOGGLE in row: - exec_flag = row[SAMPLE_EXECUTION_TOGGLE] - if exec_flag == "0" or exec_flag.lower() == "false": - print(sample_name + ": not selected") - continue - else: - print(sample_name + ": SELECTED") - - sample_path = os.path.join(paths.output_dir, paths.results_subdir, sample_name) - - present_subGroups = "\tsubGroups " - - # bsmap aligned bam files - bsmap_mapped_bam = os.path.join(sample_path, "bsmap_" + genome, sample_name + ".bam") - bsmap_mapped_bam_name = os.path.basename(bsmap_mapped_bam) - bsmap_mapped_bam_index = os.path.join(sample_path, "bsmap_" + genome, sample_name + ".bam.bai") - bsmap_mapped_bam_index_name = os.path.basename(bsmap_mapped_bam_index) - - # With the new meth bigbeds, RRBS pipeline should yield this file: - meth_bb_file = os.path.join(sample_path, "bigbed_" + genome, "RRBS_" + sample_name + ".bb") - meth_bb_name = os.path.basename(meth_bb_file) - - # bismark bigwig files - bismark_bw_file = os.path.join(sample_path, "bismark_" + genome, "extractor", sample_name + ".aln.dedup.filt.bw") - bismark_bw_name = os.path.basename(bismark_bw_file) - - # bigwigs are better actually - if not os.path.isfile(bismark_bw_file): - bismark_bw_file = os.path.join(sample_path, "bigwig_" + genome, "RRBS_" + sample_name + ".bw") - bismark_bw_name = os.path.basename(bismark_bw_file) - - # biseqMethcalling bed file - biseq_bed = os.path.join(sample_path, "biseq_" + genome, "RRBS_cpgMethylation_" + sample_name + ".bed") - biseq_bed_name = os.path.basename(biseq_bed) - - # tophat files - if args.filter: - tophat_bw_file = os.path.join(sample_path, "tophat_" + genome, sample_name + ".aln.filt_sorted.bw") - else: - tophat_bw_file = os.path.join(sample_path, "tophat_" + genome, sample_name + ".aln_sorted.bw") - tophat_bw_name = os.path.basename(tophat_bw_file) - - if os.path.isfile(tophat_bw_file) or os.path.isfile(bismark_bw_file) or os.path.isfile(meth_bb_file): - - track_out_file = os.path.join(track_out, pipeline + "trackDB.txt") - if track_out_file not in present_genomes.keys(): - # initialize a new genome - open(track_out_file, 'w').close() - genomes_file.writelines("genome " + genome.split('_')[0] + "\n") - genomes_file.writelines("trackDb " + os.path.join(genome, os.path.basename(track_out_file)) + "\n") - present_genomes[track_out_file] = [] - subGroups_perGenome[track_out_file] = subGroups - - # construct subGroups for each sample and initialize subgroups if not present - for key in subGroups_perGenome[track_out_file].keys(): - if key not in input_file.fieldnames: - continue - if not row[key] in ["NA", "", " "]: - present_subGroups += key + "=" + row[key] + " " - if not 
row[key] in subGroups_perGenome[track_out_file][key]: - subGroups_perGenome[track_out_file][key][row[key]] = row[key] - - # TODO NS: we should only have build these once; like so: - # Build short label - if trackhubs.short_label_column is not None: - shortLabel = row[trackhubs.short_label_column] - else: - shortLabel = "sl_" - if ("Library" in row.keys()): - shortLabel += row["library"][0] - if ("cell_type" in row.keys()): - shortLabel += "_" + row["cell_type"] - if ("cell_count" in row.keys()): - shortLabel += "_" + row["cell_count"] - - ########################################## - ### Aligned BAM files and index files - ########################################## - - if os.path.isfile(bsmap_mapped_bam): - - print " FOUND bsmap mapped file: " + bsmap_mapped_bam - - # copy or link the file to the hub directory - if args.copy and args.copy.find('BAM') > -1: - cmd = "cp " + bsmap_mapped_bam + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - cmd = "cp " + bsmap_mapped_bam_index + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(bsmap_mapped_bam, track_out), os.path.join(track_out, pipeline + bsmap_mapped_bam_name)) - os.symlink(os.path.relpath(bsmap_mapped_bam_index, track_out), os.path.join(track_out, pipeline + bsmap_mapped_bam_index_name)) - - # construct track for data file - track_text = "\n\ttrack " + bsmap_mapped_bam_name + "_Meth_Align" + "\n" - track_text += "\tparent DNA_Meth_Align on\n" - track_text += "\ttype bam\n" - track_text += present_subGroups + "data_type=Meth_Align" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_Meth_Align" + "\n" - track_text += "\tbigDataUrl " + pipeline + bsmap_mapped_bam_name + "\n" - - tableDict[sample_name]['BAM'] = dict([('label', 'BAM'), ('link', os.path.relpath(os.path.join(track_out, pipeline + bsmap_mapped_bam_name), track_out))]) - tableDict[sample_name]['BAI'] = dict([('label', 'BAI'), ('link', os.path.relpath(os.path.join(track_out, pipeline + bsmap_mapped_bam_index_name), track_out))]) - - present_genomes[track_out_file].append(track_text) - else: - print (" No bsmap mapped bam found: " + bsmap_mapped_bam_name) - - ########################################## - ### For BigBed files - ########################################## - - if os.path.isfile(meth_bb_file): - - print " FOUND BigBed file: " + meth_bb_file - - # copy or link the file to the hub directory - if args.copy and args.copy.find('BB') > -1: - cmd = "cp " + meth_bb_file + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(meth_bb_file, track_out), os.path.join(track_out, meth_bb_name)) - - # construct track for data file - track_text = "\n\ttrack " + meth_bb_name + "_Meth_BB" + "\n" - track_text += "\tparent DNA_Meth_BB on\n" - track_text += "\ttype bigBed\n" - track_text += present_subGroups + "data_type=Meth_BB" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_Meth_BB" + "\n" - track_text += "\tbigDataUrl " + pipeline + meth_bb_name + "\n" - - tableDict[sample_name]['BB'] = dict([('label', 'BB'), ('link', os.path.relpath(os.path.relpath(os.path.join(track_out, meth_bb_name), track_out)))]) - - present_genomes[track_out_file].append(track_text) - else: - print (" No Bigbed file found: " + meth_bb_file) - - ########################################## - ### For Methylation (bismark) BIGWIG files - ########################################## - - if 
os.path.isfile(bismark_bw_file): - print " FOUND bismark bw: " + bismark_bw_file - # copy or link the file to the hub directory - if args.copy and args.copy.find('BW') > -1: - cmd = "cp " + bismark_bw_file + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(bismark_bw_file, track_out), os.path.join(track_out, bismark_bw_name)) - # add data_type subgroup (not included in sampleAnnotation) - if "Meth" not in subGroups_perGenome[track_out_file]["data_type"]: - subGroups_perGenome[track_out_file]["data_type"]["Meth"] = "Meth" - # construct track for data file - track_text = "\n\ttrack " + bismark_bw_name + "_Meth" + "\n" - track_text += "\tparent " + trackhubs.parent_track_name + " on\n" - track_text += "\ttype bigWig\n" - track_text += present_subGroups + "data_type=Meth" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_Meth" + "\n" - track_text += "\tbigDataUrl " + bismark_bw_name + "\n" - track_text += "\tviewLimits 0:100" + "\n" - track_text += "\tviewLimitsMax 0:100" + "\n" - track_text += "\tmaxHeightPixels 100:30:10" + "\n" - - tableDict[sample_name]['BW'] = dict([('label', 'BW'), ('link', os.path.relpath(os.path.relpath(os.path.join(track_out, bismark_bw_name), track_out)))]) - - present_genomes[track_out_file].append(track_text) - else: - print (" No bismark bw found: " + bismark_bw_file) - - ########################################## - ### For biseq BED files - ########################################## - - if os.path.isfile(biseq_bed): - - print " FOUND biseq bed file: " + biseq_bed - - # copy or link the file to the hub directory - if args.copy and args.copy.find('BED') > -1: - cmd = "cp " + biseq_bed + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(biseq_bed, track_out), os.path.join(track_out, biseq_bed_name)) - - tableDict[sample_name]['BED'] = dict([('label', 'BED'), ('link', os.path.relpath(os.path.join(track_out, biseq_bed_name), track_out))]) - - else: - print (" No biseq bed file found: " + biseq_bed) - - ########################################## - ### For RNA (tophat) files - ########################################## - - if os.path.isfile(tophat_bw_file): - print " FOUND tophat bw: " + tophat_bw_file - # copy or link the file to the hub directory - if args.copy and args.copy.find('TH') > -1: - cmd = "cp " + tophat_bw_file + " " + track_out + "\n" - cmd += "chmod o+r " + os.path.join(track_out, tophat_bw_name) - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(tophat_bw_file, track_out), os.path.join(track_out, tophat_bw_name)) - # add data_type subgroup (not included in sampleAnnotation) - if "RNA" not in subGroups_perGenome[track_out_file]["data_type"]: - subGroups_perGenome[track_out_file]["data_type"]["RNA"] = "RNA" - # construct track for data file - track_text = "\n\ttrack " + tophat_bw_name + "_RNA" + "\n" - track_text += "\tparent " + trackhubs.parent_track_name + " on\n" - track_text += "\ttype bigWig\n" - track_text += present_subGroups + "data_type=RNA" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_RNA" + "\n" - track_text += "\tbigDataUrl " + tophat_bw_name + "\n" - track_text += "\tautoScale on" + "\n" - - tableDict[sample_name]['TH'] = dict([('label', 'BW'), ('link', os.path.relpath(os.path.join(track_out, tophat_bw_name), track_out))]) - - present_genomes[track_out_file].append(track_text) - else: - 
print (" No tophat bw found: " + tophat_bw_file) - - # write composit-header followed by the individual tracks to a genome specific trackDB.txt - composit_text = "" - for key in present_genomes.keys(): - # construct composite header - composit_text += "\ntrack " + str(trackhubs.parent_track_name) + "\n" - composit_text += "compositeTrack on" - count = 0 - dim_text = "dimensions dimX=" + str(trackhubs.matrix_x) + " dimY=" + str(trackhubs.matrix_y) - for subGroup in subGroups_perGenome[key].keys(): - if len(subGroups_perGenome[key][subGroup]) < 1: - continue - if not subGroup == str(trackhubs.matrix_x) and not subGroup == str(trackhubs.matrix_y): - dim_text += " dimA=" + subGroup - count += 1 - composit_text += "\nsubGroup" + str(count) + " " + subGroup + " " + subGroup + " " - for type in subGroups_perGenome[key][subGroup].keys(): - composit_text += type + "=" + subGroups_perGenome[key][subGroup][type] + " " - composit_text += "\nshortLabel " + str(trackhubs.parent_track_name) + "\n" - composit_text += "longLabel " + str(trackhubs.parent_track_name) + "\n" - composit_text += "type bigWig" + "\n" - composit_text += "color 0,60,120" + "\n" - composit_text += "spectrum on" + "\n" - composit_text += "visibility " + args.visibility + "\n" - composit_text += dim_text + "\n" - composit_text += "sortOrder " + str(trackhubs.sortOrder) + "\n" - - # write composite header - trackDB = open(key, 'a') - trackDB.writelines(composit_text) - # write individual tracks - for i in range(len(present_genomes[key])): - trackDB.writelines(present_genomes[key][i]) - super_text = "\n" - super_text += "track DNA_Meth_Align\n" - super_text += "shortLabel DNA_Meth_Align\n" - super_text += "longLabel DNA_Meth_Align\n" - super_text += "superTrack on\n" - super_text += "\n" - super_text += "track DNA_Meth_BB\n" - super_text += "shortLabel DNA_Meth_BB\n" - super_text += "longLabel DNA_Meth_BB\n" - super_text += "superTrack on\n" - - trackDB.writelines(super_text) - trackDB.close() - - report_name = pipeline + 'report.html' - - html_out += '\n' - html_out += '

{} Project\n'.format(clean_title)
- html_out += '\n'
-
- today = datetime.datetime.now()
- #html_out += 'Last updated on ' + str(today.day) +'/'+ str(today.month) +'/'+ str(today.year) + ' at ' + str(today.hour) +':'+ str(today.minute) +'\n'
- html_out += '\n'
-
- html_out += 'Useful Links\n'
- tsv_stats_name = os.path.basename(paths.output_dir)+'_stats_summary.tsv'
- tsv_stats_path = os.path.relpath(os.path.join(paths.output_dir,tsv_stats_name),track_out)
- xls_stats_name = os.path.basename(paths.output_dir)+'_stats_summary.xls'
- xls_stats_path = os.path.relpath(os.path.join(paths.output_dir,xls_stats_name),track_out)
- xlsx_stats_name = os.path.basename(paths.output_dir)+'_stats_summary.xlsx'
- xlsx_stats_path = os.path.relpath(os.path.join(paths.output_dir,xlsx_stats_name),track_out)
-
- if os.path.isfile(os.path.join(paths.write_dir,tsv_stats_name)):
-     if os.path.isfile(os.path.join(paths.write_dir,xls_stats_name)):
-         if os.path.isfile(os.path.join(paths.write_dir,xlsx_stats_name)):
-             html_out += 'Stats summary table: {} {} {}\n'.format(tsv_stats_path,'TSV',xls_stats_path,'XLS', xlsx_stats_path,'XLSX')
-         else:
-             html_out += 'Stats summary table: {} {}\n'.format(tsv_stats_path,'TSV',xls_stats_path,'XLS')
-     else:
-         html_out += 'Stats summary table: {}\n'.format(tsv_stats_path,'TSV')
- url = str(trackhubs.url).replace(':','%3A').replace('/','%2F')
- paths.ucsc_browser_link = 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db='+genome.split('_')[0]+'&hubUrl='+url+'%2F'+hub_file_name
- html_out += 'UCSC Genome Browser: {}\n'.format(paths.ucsc_browser_link,'Link')
- html_out += '\n'
-
- html_file_name = os.path.join(track_out, report_name)
- file_handle = open(name=html_file_name, mode='w')
- file_handle.write(html_out)
-
-
- html_out_tab = 'Data Files\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- html_out_tab += '\n'
- key_list = tableDict.keys()
- key_list.sort()
- counter = 0
- for key in key_list:
-     counter += 1
-     value = tableDict[key]
-     html_out_tab += '\n'
-     html_out_tab += '\n'.format(str(counter)+'.')
-     html_out_tab += '\n'.format(key)
-     html_out_tab += '\n'.format(value['BAM']['link'],value['BAM']['label'])
-     html_out_tab += '\n'.format(value['BAI']['link'],value['BAI']['label'])
-     html_out_tab += '\n'.format(value['BB']['link'],value['BB']['label'])
-     html_out_tab += '\n'.format(value['BW']['link'],value['BW']['label'])
-     html_out_tab += '\n'.format(value['BED']['link'],value['BED']['label'])
-     html_out_tab += '\n'
- html_out_tab += 'Sample NameAligned BAMBAM IndexBigBedBigWigBiseq Bed{}{}{}{}{}{}{}\n'
- file_handle.write(html_out_tab)
-
- html_out = '\n'
- html_out += 'This report was generated with software of the Biomedical Sequencing Facility: www.biomedical-sequencing.at\n'
- html_out += 'Contact: bsf@cemm.oeaw.ac.at\n'
- html_out += '\n'
- html_out += 'Valid XHTML 1.0 Transitional\n'
- html_out += 'Valid CSS!
' - html_out += '\n' - html_out += '\n' - html_out += '\n' - - file_handle.write(html_out) - file_handle.close() - - html_link_name = os.path.join(track_out, "index.html") - os.symlink(os.path.relpath(html_file_name,track_out),html_link_name) - - cmd = "cp /scratch/lab_bsf/projects/BSA_0000_RRBS_Global_Report/styles.css " + track_out - subprocess.call(cmd, shell=True) - cmd = "chmod -R go+rX " + paths.write_dir - subprocess.call(cmd, shell=True) - - hub_file_link = str(trackhubs.url) + "/" + hub_file_name - report_link = str(trackhubs.url) + "/" + genome + "/" - link_string = 'Report ' + report_link + '\n' - link_string += 'UCSCbrowser ' + paths.ucsc_browser_link + '\n' - print '\nDONE!' - print link_string - - link_file = open(name=os.path.join(paths.write_dir, pipeline + 'links.txt'), mode='w') - link_file.write(link_string) - link_file.close() - -finally: - csv_file.close() diff --git a/scripts/normalize_wig.R b/scripts/normalize_wig.R deleted file mode 100755 index 38b0f443..00000000 --- a/scripts/normalize_wig.R +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env Rscript - -library(data.table) -suppressPackageStartupMessages(library("optparse")) - -##scale=10000000 -##genome="mm10" -##results_dir="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/" -##stats_path="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/ALL_stats_summary.tsv" - -# specify our desired options in a list -option_list = list( - make_option(c("-r", "--results_dir"), type="character", help="Input Results folder (REQUIRED)"), - make_option(c("-g", "--genome"), type="character", help="Genome used for alignment (REQUIRED)"), - make_option(c("-s", "--stats"), type="character", help="Alignment stats table for all samples (REQUIRED)"), - make_option(c("-n", "--scale"), type="character", help="Normalization scale (REQUIRED)") - ) - -opt = parse_args(OptionParser(option_list=option_list)) -if (length(opt)<4) { - print_help(OptionParser(option_list=option_list)) -}else { - results_dir=opt$results_dir - genome=opt$genome - stats_path=opt$stats - scale=opt$scale -} - -print(results_dir) -print(genome) -print(stats_path) -print(scale) - -chroSizes_path=paste0("/data/groups/lab_bock/shared/resources/genomes/",genome,"/",genome,".chromSizes") - - - - -stats=fread(stats_path) -stats=stats[pipeline=="rnaTopHat"] -stats[,wigPath:=paste0(results_dir,"/",sampleName,"/tophat_",genome,"/",sampleName,".aln_sorted.wig"),] - -for (i in c(2:nrow(stats))){ - sampleName=stats[i]$sampleName - message(sampleName) - wigFileName=stats[i]$wigPath - mappedReads=stats[i]$Aligned_reads - if (file.exists(wigFileName)){ - system(paste0("sed 's/ \\+/\\t/g' ",wigFileName," > ", wigFileName,"_temp",sep="")) - wig=fread(paste0(wigFileName,"_temp"),header=FALSE) - wig[V1=="variableStep",V3:=paste0(V1," ",V2)] - wig[grep("variableStep",V3),V1:=NA] - wig[grep("variableStep",V3),V2:=NA] - wig[,V2:=round(as.numeric(V2)/mappedReads*scale,2),] - wig[,c("V1","V2"):=list(as.character(V1),as.character(V2)),] - wig[grep("variableStep",V3),c("V1","V2"):=list(V3,"")] - wig[,V3:=NULL,] - write.table(wig,sub(".wig","_norm.wig_temp",wigFileName),sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE) - system(paste0("sed 's/\t$//g' ",sub(".wig","_norm.wig_temp",wigFileName)," > ", sub(".wig","_norm.wig",wigFileName))) - system(paste("wigToBigWig",sub(".wig","_norm.wig",wigFileName),chroSizes_path,sub(".wig","_norm.bw",wigFileName),sep=" ")) - system(paste("rm ",sub(".wig",".wig_temp",wigFileName))) - system(paste("rm 
",sub(".wig","_norm.wig_temp",wigFileName))) - }else{ - message(paste0("File not found. Skipping: ",wigFileName)) - next} -} - diff --git a/scripts/normalize_wig_submit.sh b/scripts/normalize_wig_submit.sh deleted file mode 100755 index 374eb7b5..00000000 --- a/scripts/normalize_wig_submit.sh +++ /dev/null @@ -1,13 +0,0 @@ -#! /bin/bash - -scale=10000000 -genome="mm10" -results_dir="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/" -stats_path="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/ALL_stats_summary.tsv" - -logdir="$results_dir/log/" -mkdir -p $logdir - - - -sbatch --export=NONE --get-user-env=L --job-name=normalize_wig --ntasks=1 --cpus-per-task=1 --mem-per-cpu=8000 --partition=longq --time=2-00:00:00 -o ${logdir}/normalize_wig_%j.log normalize_wig.R -g $genome -n $scale -r $results_dir -s $stats_path \ No newline at end of file diff --git a/scripts/summarizePipelineStats.R b/scripts/summarizePipelineStats.R deleted file mode 100755 index 4f7f150f..00000000 --- a/scripts/summarizePipelineStats.R +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env Rscript -options(echo=FALSE); -library(data.table) -library(reshape2) #no longer necessary after data.table 1.9.5?? -suppressPackageStartupMessages(library("optparse")) - -# specify our desired options in a list -option_list = list( -make_option(c("-i", "--inputFolder"), type="character", help="Input Results folder (REQUIRED)")) - -opt = parse_args(OptionParser(option_list=option_list)) -if (is.null(opt$inputFolder)) { - print_help(OptionParser(option_list=option_list)); - inputFolder = "/fhgfs/groups/lab_bock/shared/COREseq/results_pipeline3" -# q(); -} else { - inputFolder=opt$inputFolder -} -message("input folder: ", inputFolder); -pipeDirs = list.dirs(inputFolder, recursive=FALSE) - -message("Read all *_stats.tsv files in the pipeline results folder") -results=list() -dir = pipeDirs[[1]]; -for (dir in pipeDirs) { - message(dir); - statFiles = list.files(dir, pattern="_stats", recursive=FALSE) - statFiles2 = list.files(dir, pattern="stats_", recursive=FALSE) - statFiles = c(statFiles, statFiles2) - for (statFile in statFiles) { - message(statFile); - pipeline = gsub("_stats.tsv", "", statFile) - pipeline = gsub("stats_", "", pipeline) - statPath = paste0(dir, "/", statFile); - # Not the best, but I had to put this in just in case - # there are empty lines in the stat file; this removes them - message(":") - system(paste0("sed -i '/^\\s*$/d' ", statPath)) - message(":") - a = fread(statPath) - setnames(a, c("key", "value")) - a[,key:=gsub(" ", "_", key)] # Change spaces to underscores - #Order keys as factors, to maintain order through later cast. - a[,key:=factor(key, levels=unique(key))] - #setkey(a, "key") - a[,sampleName:=basename(dir)] - a[,pipeline:=pipeline] - sampleName = basename(dir) - if (is.null(results[[pipeline]])) { results[[pipeline]] = list(); } - results[[pipeline]][[sampleName]] = a; - } -} -if (length(results) ==0) { - stop("No stats files found."); -} -results -#Combined, divided by pipeline -resultsDT = lapply(results, function(x) { do.call(rbind, x); }) - -# Select latest for identical statistics -resultsDT = lapply(resultsDT, function(x) { x[,list(value=value[length(value)]), by=c("key", "sampleName", "pipeline"), roll=+Inf] }) - -# Cast to wide format -resultsMat = lapply(resultsDT, dcast, formula= "... ~ key") -resultsMat = lapply(resultsMat, as.data.table) -# Convert number-only cols to numerics, so I can do some stats below. 
-numToNumeric = function(DT) { - return(DT[,lapply(.SD, function(x) { if(!any(grepl("[a-zA-Z:_\\-]", x))) { return(as.numeric(x)); } else { return(x)} })]) -} -#lapply(resultsMat, sapply, mode) -resultsMat = lapply(resultsMat, numToNumeric) - -################################################################################ -# Do any pipeline-specific calculations here -################################################################################ -nofail = function(x) { - tryCatch( {x}, error = function(e) { message("Pipeline-specific summary error: ", e); } ) -} - #WGBS - if ("WGBS" %in% names(resultsMat) ) { - nofail( { resultsMat$WGBS[, total_efficiency := (Deduplicated_reads)/Raw_reads] }) - nofail( { resultsMat$WGBS[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$WGBS[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - nofail( { resultsMat$WGBS[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads] }) - nofail( { resultsMat$WGBS[, filt_loss_rate := (Deduplicated_reads - Filtered_reads)/Deduplicated_reads] }) - } - - if ("RRBS" %in% names(resultsMat) ) { - nofail( { resultsMat$RRBS[, total_efficiency := (Aligned_reads)/Raw_reads] }) - nofail( { resultsMat$RRBS[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$RRBS[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - #nofail( { resultsMat$RRBS[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads] }) - #nofail( { resultsMat$RRBS[, filt_loss_rate := (Deduplicated_reads - Filtered_reads)/Deduplicated_reads] }) - } - - # Tophat -if ("rnaTopHat" %in% names(resultsMat) ) { - nofail( { resultsMat$rnaTopHat[, total_efficiency := Filtered_reads/Raw_reads] }) - nofail( { resultsMat$rnaTopHat[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$rnaTopHat[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - nofail( { resultsMat$rnaTopHat[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads] }) - nofail( { resultsMat$rnaTopHat[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads] }) - } - - # Bitseq -if ("rnaBitSeq" %in% names(resultsMat) ) { - nofail( { resultsMat$rnaBitSeq[, total_efficiency := Filtered_reads/Raw_reads] }) - nofail( { resultsMat$rnaBitSeq[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$rnaBitSeq[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - nofail( { resultsMat$rnaBitSeq[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads] }) - nofail( { resultsMat$rnaBitSeq[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads] }) - nofail( { resultsMat$rnaBitSeq[, ERCC_alignment_rate := (ERCC_aligned_reads)/Trimmed_reads] }) - } - -################################################################################ -# Write results -################################################################################ -commonCols = Reduce(intersect, lapply(resultsMat, colnames)); -commonList = lapply(resultsMat, function(x) { x[,commonCols, with=FALSE] }) -commonTable = do.call(rbind, commonList) - - -# Write individual result tables for each pipeline -pipelines = names(resultsMat) -for (p in pipelines) { - pipeStatFile = paste0(inputFolder, "/", p, "_stats_summary.tsv") - message("Writing pipeline stats table: ", pipeStatFile) - write.table(resultsMat[[p]], pipeStatFile, sep="\t",row.names=FALSE,quote=FALSE) -} -if (length(names(resultsMat)) > 1 ) { # only if there are multiple pipelines -# 
Produce an additional table with only common features -commonTableFile = paste0(inputFolder, "/ALL_stats_summary.tsv"); -message("Writing common table: ", commonTableFile); -write.table(commonTable, commonTableFile,sep="\t",row.names=FALSE,quote=FALSE) -} - diff --git a/scripts/summarizePipelineStats_complex.R b/scripts/summarizePipelineStats_complex.R deleted file mode 100755 index d0888b30..00000000 --- a/scripts/summarizePipelineStats_complex.R +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env Rscript -options(echo=FALSE); -library(data.table) -library(reshape2) #no longer necessary after data.table 1.9.5?? -suppressPackageStartupMessages(library("optparse")) - -# specify our desired options in a list -option_list = list( -make_option(c("-i", "--inputFolder"), type="character", help="Input Results folder (REQUIRED)")) - -opt = parse_args(OptionParser(option_list=option_list)) -if (is.null(opt$inputFolder)) { - print_help(OptionParser(option_list=option_list)); - inputFolder = "/fhgfs/groups/lab_bock/shared/COREseq/results_pipeline3" -# q(); -} else { - inputFolder=opt$inputFolder -} - -message("input folder: ", inputFolder); -pipeDirs = list.dirs(inputFolder, recursive=FALSE) - -message("Read all *_stats.txt files in the pipeline results folder") -results=list() -dir = pipeDirs[[1]]; -for (dir in pipeDirs) { - message(dir); - statFiles = list.files(dir, pattern="_stats.tsv") - statFiles2 = list.files(dir, pattern="stats_") - statFiles = c(statFiles, statFiles2) - for (statFile in statFiles) { - message(statFile); - pipeline = gsub("_stats.tsv", "", statFile) - pipeline = gsub("stats_", "", pipeline) - statPath = paste0(dir, "/", statFile); - a = fread(statPath) - setnames(a, c("key", "value")) - a[,key:=gsub(" ", "_", key)] # Change spaces to underscores - #Order keys as factors, to maintain order through later cast. - a[,key:=factor(key, levels=unique(key))] - #setkey(a, "key") - a[,sampleName:=basename(dir)] - a[,pipeline:=pipeline] - sampleName = basename(dir) - if (is.null(results[[pipeline]])) { results[[pipeline]] = list(); } - results[[pipeline]][[sampleName]] = a; - } -} -if (length(results) ==0) { - stop("No stats files found."); -} -results -#Combined, divided by pipeline -resultsDT = lapply(results, function(x) { do.call(rbind, x); }) - -# Select latest for identical statistics -resultsDT = lapply(resultsDT, function(x) { x[,list(value=value[length(value)]), by=c("key", "sampleName", "pipeline"), roll=+Inf] }) - -# Cast to wide format -resultsMat = lapply(resultsDT, dcast, formula= "... ~ key") -resultsMat = lapply(resultsMat, as.data.table) -# Convert number-only cols to numerics, so I can do some stats below. 
-numToNumeric = function(DT) { - return(DT[,lapply(.SD, function(x) { if(!any(grepl("[a-zA-Z:_\\-]", x))) { return(as.numeric(x)); } else { return(x)} })]) -} -resultsMat = lapply(resultsMat, numToNumeric) -#lapply(resultsMat, sapply, mode) - -################################################################################ -# Do any pipeline-specific calculations here -################################################################################ - -#WGBS - -if ("WGBS" %in% names(resultsMat)){ - resultsMat$WGBS[, total_efficiency := (Deduplicated_reads)/Raw_reads] - resultsMat$WGBS[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] - resultsMat$WGBS[, alignment_rate := (Aligned_reads)/Trimmed_reads] - resultsMat$WGBS[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads] - resultsMat$WGBS[, filt_loss_rate := (Deduplicated_reads - Filtered_reads)/Deduplicated_reads] -} - - -# Tophat -if ("rnaTopHat" %in% names(resultsMat)){ - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaTopHat[, total_efficiency := Filtered_reads/Raw_reads]} - resultsMat$rnaTopHat[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] - resultsMat$rnaTopHat[, alignment_rate := (Aligned_reads)/Trimmed_reads] - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - if ("Deduplicated_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaTopHat[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads]} - resultsMat$rnaTopHat[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads]} - else if ("Deduplicated_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaTopHat[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads]} -} -# Bitseq -if ("rnaBitSeq" %in% names(resultsMat)){ - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaBitSeq[, total_efficiency := Filtered_reads/Raw_reads]} - resultsMat$rnaBitSeq[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] - resultsMat$rnaBitSeq[, alignment_rate := Aligned_reads/Trimmed_reads] - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaBitSeq[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads] - resultsMat$rnaBitSeq[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads]} - else {resultsMat$rnaBitSeq[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads]} - resultsMat$rnaBitSeq[, ERCC_alignment_rate := (ERCC_aligned_reads)/Trimmed_reads] -} - -################################################################################ -# Write results -################################################################################ -commonCols = Reduce(intersect, lapply(resultsMat, colnames)); -commonList = lapply(resultsMat, function(x) { x[,commonCols, with=FALSE] }) -commonTable = do.call(rbind, commonList) - - -# Write individual result tables for each pipeline -pipelines = names(resultsMat) -for (p in pipelines) { - pipeStatFile = paste0(inputFolder, "/", p, "_stats_summary.tsv") - message("Writing pipeline stats table: ", pipeStatFile) - write.table(resultsMat[[p]], pipeStatFile, sep="\t",row.names=FALSE,quote=FALSE) -} - -# Produce an additional table with only common features -commonTableFile = paste0(inputFolder, "/ALL_stats_summary.tsv"); -message("Writing common table: ", commonTableFile); -write.table(commonTable, commonTableFile,sep="\t",row.names=FALSE,quote=FALSE) - - From f58fc742260264c7d4a29e9a6c880fbe1470e018 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 15 Jun 2017 
10:53:26 -0400 Subject: [PATCH 12/94] Fix version requirements --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f8ec0933..00fecc93 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,8 @@ for line in reqs_file: if not line.strip(): continue - DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) + #DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) + DEPENDENCIES.append(line) # numexpr for pandas try: From 13d553b82361df142983a6e07057189a70b5b261 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 15 Jun 2017 12:02:13 -0400 Subject: [PATCH 13/94] enable pipeline_interfaces attribute alongside pipelines_dir --- looper/models.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/looper/models.py b/looper/models.py index 3f65ac91..dd6c7383 100644 --- a/looper/models.py +++ b/looper/models.py @@ -624,6 +624,27 @@ def parse_config_file(self, subproject=None): _LOGGER.debug("Metadata: %s", str(self.metadata)) delattr(self, "paths") + # In looper 0.6, we added pipeline_interfaces to metadata + # For backwards compatibility, merge it with pipelines_dir + + if "metadata" in config: + if "pipelines_dir" in self.metadata: + _LOGGER.warning("Looper v0.6 suggests " + "switching from pipelines_dir to " + " pipeline_interfaces. See docs for details.") + if "pipeline_interfaces" in self.metadata: + if "pipelines_dir" in self.metadata: + _LOGGER.error("You defined both 'pipeline_interfaces' and" + " 'pipelines_dir'. Please remove your 'pipelines_dir' definition.") + raise AttributeError("Extra pipelines_dir attribute.") + else: + self.metadata.pipelines_dir = self.metadata.pipeline_interfaces + + _LOGGER.debug("Adding pipeline_interfaces to " + "pipelines_dir. New value: {}". + format(self.metadata.pipelines_dir)) + + # Ensure required absolute paths are present and absolute. for var in self.required_metadata: if var not in self.metadata: From 5410cd0978e8fa5d4541dc99f77b70b369b93317 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 15 Jun 2017 14:47:08 -0400 Subject: [PATCH 14/94] move extra attribute explanation to exception message; downgrade message severity --- looper/models.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/looper/models.py b/looper/models.py index dd6c7383..33748bff 100644 --- a/looper/models.py +++ b/looper/models.py @@ -358,7 +358,7 @@ def __init__(self, config_file, subproject=None, permissive=True, file_checks=False, compute_env_file=None, no_environment_exception=None, no_compute_exception=None): - _LOGGER.info("Creating %s from file: '%s'", + _LOGGER.debug("Creating %s from file: '%s'", self.__class__.__name__, config_file) super(Project, self).__init__() @@ -413,7 +413,7 @@ def __init__(self, config_file, subproject=None, self.config_file = _os.path.abspath(config_file) # Parse config file - _LOGGER.info("Parsing %s config file", self.__class__.__name__) + _LOGGER.debug("Parsing %s config file", self.__class__.__name__) if subproject: _LOGGER.info("Using subproject: '{}'".format(subproject)) self.parse_config_file(subproject) @@ -634,9 +634,10 @@ def parse_config_file(self, subproject=None): " pipeline_interfaces. See docs for details.") if "pipeline_interfaces" in self.metadata: if "pipelines_dir" in self.metadata: - _LOGGER.error("You defined both 'pipeline_interfaces' and" - " 'pipelines_dir'. 
Please remove your 'pipelines_dir' definition.") - raise AttributeError("Extra pipelines_dir attribute.") + raise AttributeError( + "You defined both 'pipeline_interfaces' and " + "'pipelines_dir'. Please remove your " + "'pipelines_dir' definition.") else: self.metadata.pipelines_dir = self.metadata.pipeline_interfaces From 80611e611d5c7494beb8d151b6981aaaede350c3 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 16 Jun 2017 08:33:39 -0400 Subject: [PATCH 15/94] Fix import of scripts dir --- looper/models.py | 1 + setup.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/looper/models.py b/looper/models.py index 33748bff..f597a6aa 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1177,6 +1177,7 @@ def make_sample(data): try: return pairing[self.alpha_cased(data.library)](data) except (AttributeError, KeyError): + _LOGGER.debug("Error making child subclass") return Sample(data) return make_sample diff --git a/setup.py b/setup.py index 00fecc93..1e102fe0 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,9 @@ def get_static(name, condition=None): return [i for i in filter(lambda x: eval(condition), static)] # scripts to be added to the $PATH -scripts = get_static("scripts", condition="'.' in x") +# scripts = get_static("scripts", condition="'.' in x") +# scripts removed (TO remove this) +scripts = None with open("looper/_version.py", 'r') as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") From ea3f005b69476ecf9815842b59badb0c6711a4d9 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 16 Jun 2017 08:46:55 -0400 Subject: [PATCH 16/94] Move add_sample_sheet from looper back into models --- looper/looper.py | 1 - looper/models.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/looper/looper.py b/looper/looper.py index f83695d4..8ba8ad9c 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -791,7 +791,6 @@ def main(): args.config_file, args.subproject, file_checks=args.file_checks, compute_env_file=getattr(args, 'env', None)) - prj.add_sample_sheet() prj.finalize_pipelines_directory() _LOGGER.info("Results subdir: " + prj.metadata.results_subdir) diff --git a/looper/models.py b/looper/models.py index f597a6aa..fce79c95 100644 --- a/looper/models.py +++ b/looper/models.py @@ -441,6 +441,8 @@ def __init__(self, config_file, subproject=None, self.sheet = None self.samples = list() + self.add_sample_sheet() + @property def default_cmpenv_file(self): From 1ea7642fef6dd1cf48fffdf830d61917e6b11867 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 16 Jun 2017 09:42:28 -0400 Subject: [PATCH 17/94] inform about subtypes found --- looper/models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/looper/models.py b/looper/models.py index fce79c95..d415d66f 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1171,6 +1171,10 @@ def make_sample(data): lambda maybe_class: inspect.isclass(maybe_class) and issubclass(maybe_class, Sample)) + _LOGGER.debug("Sample subtypes: %s", + ", ".join([subtype.__name__ + for subtype in sample_types])) + # TODO: perhaps modify or alter handling of need for __library__. 
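        # Each Sample subclass discovered in the pipelines package is
        # expected to declare a __library__ class attribute naming the
        # protocol it handles (e.g., a hypothetical
        # `class RRBSSample(Sample): __library__ = "RRBS"`), so each
        # annotation row can be instantiated as the most specific known
        # subtype; rows whose library matches no subclass fall back to the
        # base Sample in make_sample below.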
pairing = {self.alpha_cased(sample_class.__library__): sample_class for sample_type, sample_class in sample_types} From 992622855f4715ca3a205b94b8f7b161fc15a5eb Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 16 Jun 2017 09:45:58 -0400 Subject: [PATCH 18/94] reflect addition of sample sheet in project ctor --- tests/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 01135826..51ee20f7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -432,7 +432,6 @@ def proj(request): data in file pointed to by `request` class """ p = _create(request, Project) - p.add_sample_sheet() p.finalize_pipelines_directory() return p From 91da40f42edc87f3968159a032c34d2109433a3e Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 16 Jun 2017 09:46:41 -0400 Subject: [PATCH 19/94] trigger build --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6c9aadd0..d1221fc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,5 +13,6 @@ install: script: pytest branches: only: + - 0.6-rc2 - dev - master From 1af7860ad60d346839e71fe90080469248234b35 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 16 Jun 2017 19:15:19 -0400 Subject: [PATCH 20/94] amid sample subclass overhaul --- looper/looper.py | 74 +++-- looper/models.py | 741 ++++++++++++++++++++++++++--------------------- 2 files changed, 439 insertions(+), 376 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 8ba8ad9c..2bc32985 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -18,14 +18,10 @@ from .utils import VersionInHelpParser try: - from .models import \ - InterfaceManager, PipelineInterface, \ - ProtocolMapper + from .models import PipelineInterface, ProtocolMapper except: sys.path.append(os.path.join(os.path.dirname(__file__), "looper")) - from models import \ - InterfaceManager, PipelineInterface, \ - ProtocolMapper + from models import PipelineInterface, ProtocolMapper from colorama import init init() @@ -131,7 +127,7 @@ def parse_arguments(): destroy_subparser, check_subparser, clean_subparser]: subparser.add_argument( "config_file", - help="Project YAML config file.") + help="Project configuration file (YAML).") subparser.add_argument( "--file-checks", action="store_false", @@ -140,11 +136,12 @@ def parse_arguments(): "-d", "--dry-run", action="store_true", - help="Don't actually submit.") + help="Don't actually submit the project/subproject.") subparser.add_argument( "--sp", dest="subproject", - help="Supply subproject") + help="Name of subproject to use, as designated in the " + "project's configuration file") # To enable the loop to pass args directly on to the pipelines... args, remaining_args = parser.parse_known_args() @@ -175,7 +172,7 @@ def parse_arguments(): -def run(prj, args, remaining_args, interface_manager): +def run(prj, args, remaining_args): """ Main Looper function: Submit jobs for samples in project. @@ -184,15 +181,13 @@ def run(prj, args, remaining_args, interface_manager): :param Iterable[str] remaining_args: arguments given to this module's parser that were not defined as options it should parse, to be passed on to parser(s) elsewhere - :param InterfaceManager interface_manager: aggregator and manager of - pipeline interfaces and protocol mappings """ # Easier change later, especially likely for library --> protocol. 
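    # Attribute names looked up on each Sample below: the "library" column
    # holds the protocol name used to choose pipelines, and "read_type"
    # (when present) is validated against valid_read_types ("single" or
    # "paired").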
_read_type = "read_type" _protocol = "library" - _start_counter(len(prj.samples)) + _start_counter(len(prj.samples())) valid_read_types = ["single", "paired"] @@ -204,13 +199,13 @@ def run(prj, args, remaining_args, interface_manager): # Create a problem list so we can keep track and show them at the end. failures = [] - for sample in prj.samples: + for sample in prj.samples(): _LOGGER.debug(sample) _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join( + sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) - _LOGGER.debug("Pipeline output folder: '%s'", pipeline_outfolder) + _LOGGER.debug("Sample output folder: '%s'", sample_output_folder) skip_reasons = [] # Don't submit samples with duplicate names. @@ -220,7 +215,8 @@ def run(prj, args, remaining_args, interface_manager): # Check if sample should be run. if hasattr(sample, SAMPLE_EXECUTION_TOGGLE): if sample[SAMPLE_EXECUTION_TOGGLE] != "1": - skip_reasons.append("Column '{}' deselected".format(SAMPLE_EXECUTION_TOGGLE)) + skip_reasons.append("Column '{}' deselected". + format(SAMPLE_EXECUTION_TOGGLE)) # Check if single_or_paired value is recognized. if hasattr(sample, _read_type): @@ -234,7 +230,7 @@ def run(prj, args, remaining_args, interface_manager): # Get the base protocol-to-pipeline mappings if hasattr(sample, _protocol): protocol = sample.library.upper() - pipelines = interface_manager.build_pipelines(protocol) + pipelines = prj.build_pipelines(protocol) if len(pipelines) == 0: skip_reasons.append( "No pipeline found for protocol {}".format(protocol)) @@ -368,7 +364,7 @@ def run(prj, args, remaining_args, interface_manager): submitted = cluster_submit( sample, prj.compute.submission_template, prj.compute.submission_command, submit_settings, - prj.metadata.submission_subdir, pipeline_outfolder, + prj.metadata.submission_subdir, sample_output_folder, pl_name, args.time_delay, submit=True, dry_run=args.dry_run, ignore_flags=args.ignore_flags, remaining_args=remaining_args) @@ -422,11 +418,11 @@ def summarize(prj): columns = [] stats = [] - _start_counter(len(prj.samples)) + _start_counter(len(prj.samples())) - for sample in prj.samples: + for sample in prj.samples(): _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join( + sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) # Grab the basic info from the annotation sheet for this sample. @@ -434,7 +430,7 @@ def summarize(prj): sample_stats = sample.get_sheet_dict() columns.extend(sample_stats.keys()) # Version 0.3 standardized all stats into a single file - stats_file = os.path.join(pipeline_outfolder, "stats.tsv") + stats_file = os.path.join(sample_output_folder, "stats.tsv") if os.path.isfile(stats_file): _LOGGER.info("Found stats file: '%s'", stats_file) else: @@ -484,17 +480,17 @@ def destroy(prj, args, preview_flag=True): _LOGGER.info("Results to destroy:") - _start_counter(len(prj.samples)) + _start_counter(len(prj.samples())) - for sample in prj.samples: + for sample in prj.samples(): _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join( + sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) if preview_flag: # Preview: Don't actually delete, just show files. 
- _LOGGER.info(str(pipeline_outfolder)) + _LOGGER.info(str(sample_output_folder)) else: - destroy_sample_results(pipeline_outfolder, args) + destroy_sample_results(sample_output_folder, args) if not preview_flag: _LOGGER.info("Destroy complete.") @@ -521,13 +517,13 @@ def clean(prj, args, preview_flag=True): _LOGGER.info("Files to clean:") - _start_counter(len(prj.samples)) + _start_counter(len(prj.samples())) - for sample in prj.samples: + for sample in prj.samples(): _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join(prj.metadata.results_subdir, - sample.sample_name) - cleanup_files = glob.glob(os.path.join(pipeline_outfolder, + sample_output_folder = os.path.join( + prj.metadata.results_subdir, sample.sample_name) + cleanup_files = glob.glob(os.path.join(sample_output_folder, "*_cleanup.sh")) if preview_flag: # Preview: Don't actually clean, just show what will be cleaned. @@ -597,7 +593,7 @@ def _submission_status_text(curr, total, sample_name, sample_library): def cluster_submit( sample, submit_template, submission_command, variables_dict, - submission_folder, pipeline_outfolder, pipeline_name, time_delay, + submission_folder, sample_output_folder, pipeline_name, time_delay, submit=False, dry_run=False, ignore_flags=False, remaining_args=None): """ Submit job to cluster manager. @@ -610,7 +606,7 @@ def cluster_submit( the submission template :param str submission_folder: path to the folder in which to place submission files - :param str pipeline_outfolder: path to folder into which the pipeline + :param str sample_output_folder: path to folder into which the pipeline will write file(s), and where to search for flag file to check if a sample's already been submitted :param str pipeline_name: name of the pipeline that the job will run @@ -660,7 +656,7 @@ def cluster_submit( # Check if job is already submitted (unless ignore_flags is set to True) if not ignore_flags: flag_files = glob.glob(os.path.join( - pipeline_outfolder, pipeline_name + "*.flag")) + sample_output_folder, pipeline_name + "*.flag")) if len(flag_files) > 0: flags = [os.path.basename(f) for f in flag_files] _LOGGER.info("> Not submitting, flag(s) found: {}".format(flags)) @@ -791,7 +787,6 @@ def main(): args.config_file, args.subproject, file_checks=args.file_checks, compute_env_file=getattr(args, 'env', None)) - prj.finalize_pipelines_directory() _LOGGER.info("Results subdir: " + prj.metadata.results_subdir) @@ -807,14 +802,13 @@ def main(): raise AttributeError( "Looper requires at least one pipeline(s) location.") - interface_manager = InterfaceManager(prj.metadata.pipelines_dir) - if not interface_manager.ifproto_by_proto_name: + if not prj.interface_manager.ifproto_by_proto_name: _LOGGER.error( "The interface manager is empty. Does your project point " "to at least one pipelines location that exists?") return try: - run(prj, args, remaining_args, interface_manager=interface_manager) + run(prj, args, remaining_args) except IOError: _LOGGER.error("{} pipelines_dir: '{}'".format( prj.__class__.__name__, prj.metadata.pipelines_dir)) diff --git a/looper/models.py b/looper/models.py index d415d66f..8241d28d 100644 --- a/looper/models.py +++ b/looper/models.py @@ -106,8 +106,11 @@ def is_url(maybe_url): class Paths(object): """ A class to hold paths as attributes. """ - def __str__(self): - return "Paths object." 
+ def __getitem__(self, key): + """ + Provides dict-style access to attributes + """ + return getattr(self, key) def __iter__(self): """ @@ -120,11 +123,8 @@ def __iter__(self): """ return iter(self.__dict__.values()) - def __getitem__(self, key): - """ - Provides dict-style access to attributes - """ - return getattr(self, key) + def __str__(self): + return "Paths object." @@ -307,6 +307,26 @@ def __repr__(self): +def process_pipeline_interfaces(pipeline_interface_locations): + """ + + :param pipeline_interface_locations: + :return: + """ + ifproto_by_proto_name = defaultdict(list) + for pipe_iface_location in pipeline_interface_locations: + if not _os.path.exists(pipe_iface_location): + _LOGGER.warn("Ignoring nonexistent pipeline interface " + "location '%s'", pipe_iface_location) + continue + proto_iface = ProtocolInterface(pipe_iface_location) + for proto_name in proto_iface.protomap: + _LOGGER.debug("Protocol name: {}".format(proto_name)) + ifproto_by_proto_name[proto_name].append(proto_iface) + return ifproto_by_proto_name + + + @copy class Project(AttributeDict): """ @@ -362,8 +382,6 @@ def __init__(self, config_file, subproject=None, self.__class__.__name__, config_file) super(Project, self).__init__() - default_compute = default_compute or self.default_cmpenv_file - # Initialize local, serial compute as default (no cluster submission) # Start with default environment settings. _LOGGER.debug("Establishing default environment settings") @@ -435,36 +453,64 @@ def __init__(self, config_file, subproject=None, except AttributeError: self.derived_columns = self.DERIVED_COLUMNS_DEFAULT - # Sheet will be set to non-null value by call to add_sample_sheet(). - # That call also sets the samples (list) attribute for the instance - # and adds default derived columns. - self.sheet = None - self.samples = list() + # SampleSheet creation populates project's samples, adds the + # sheet itself, and adds any derived columns. + self.interfaces_by_protocol = \ + process_pipeline_interfaces(self.metadata.pipelines_dir) + self.sheet = check_sheet(self.metadata.sample_annotation) + self.finalize_pipelines_directory() - self.add_sample_sheet() + # Defer Sample creation until needed. + self._samples_by_pipeline = {} @property - def default_cmpenv_file(self): + def compute_env_var(self): + """ + Environment variable through which to access compute settings. + + :return str: name of the environment variable to pointing to + compute settings + """ + return COMPUTE_SETTINGS_VARNAME + + + @property + def default_compute_envfile(self): """ Path to default compute environment settings file. """ return _os.path.join( self.templates_folder, "default_compute_settings.yaml") @property - def templates_folder(self): - return _os.path.join(_os.path.dirname(__file__), "submit_templates") + def output_dir(self): + """ + Directory in which to place results and submissions folders. + + By default, assume that the project's configuration file specifies + an output directory, and that this is therefore available within + the project metadata. If that assumption does not hold, though, + consider the folder in which the project configuration file lives + to be the project's output directory. + + :return str: path to the project's output directory, either as + specified in the configuration file or the folder that contains + the project's configuration file. 
+ """ + try: + return self.metadata.output_dir + except AttributeError: + return _os.path.dirname(self.config_file) @property - def compute_env_var(self): + def project_folders(self): """ - Environment variable through which to access compute settings. - - :return str: name of the environment variable to pointing to - compute settings + Names of folders to nest within a project output directory. + + :return Iterable[str]: names of output-nested folders """ - return COMPUTE_SETTINGS_VARNAME + return ["results_subdir", "submission_subdir"] @property @@ -483,34 +529,51 @@ def required_metadata(self): @property - def project_folders(self): - """ - Names of folders to nest within a project output directory. - - :return Iterable[str]: names of output-nested folders - """ - return ["results_subdir", "submission_subdir"] + def sample_names(self): + """ Names of samples of which this Project is aware. """ + return iter(self.sheet[SAMPLE_NAME_COLNAME]) + @property - def output_dir(self): + def samples(self): + if self._samples is None: + self.create_base_samples() + return self._samples + + + def samples_for(self, pipeline=None): """ - Directory in which to place results and submissions folders. - - By default, assume that the project's configuration file specifies - an output directory, and that this is therefore available within - the project metadata. If that assumption does not hold, though, - consider the folder in which the project configuration file lives - to be the project's output directory. - - :return str: path to the project's output directory, either as - specified in the configuration file or the folder that contains - the project's configuration file. + Fetch each of this Project's Sample objects for given pipeline. + + If Sample objects have not yet been created for this Project, create + them as needed. What is needed and what is returned are determined by + the argument to the pipeline parameter. If no argument is provided as + the desired pipeline, it's assumed that the desire is for all possible + Sample objects to be created and return based on the samples that and + pipelines of which this Project is aware, and the relationships + between them. + + :param str pipeline: name of pipeline for which to fetch samples + :return Iterable[Sample]: """ - try: - return self.metadata.output_dir - except AttributeError: - return _os.path.dirname(self.config_file) + if not self._samples_by_pipeline: + # TODO: also ensure that case of unknown pipeline is covered here. + self.create_samples(pipeline) + if pipeline is None: + return list(itertools.chain(*self._samples_by_pipeline.values())) + else: + try: + return self._samples_by_pipeline[pipeline] + except KeyError: + _LOGGER.error("Unknown pipeline: '%s'", pipeline) + return [] + + + @property + def templates_folder(self): + """ Path to folder with default submission templates. """ + return _os.path.join(_os.path.dirname(__file__), "submit_templates") @staticmethod @@ -529,27 +592,112 @@ def infer_name(path_config_file): return config_folder - def _handle_missing_env_attrs(self, env_settings_file, when_missing): - """ Default environment settings aren't required; warn, though. 
""" - missing_env_attrs = \ - [attr for attr in ["environment", "environment_file"] - if not hasattr(self, attr) or getattr(self, attr) is None] - if not missing_env_attrs: + def build_pipelines(self, protocol, priority=True): + """ + + + :param str protocol: + :param bool priority: + :return: + """ + + try: + protocol_interfaces = self.interfaces_by_protocol[protocol] + except KeyError: + _LOGGER.warn("Unknown protocol: '{}'".format(protocol)) + return [] + + jobs = [] + pipeline_keys_used = set() + _LOGGER.debug("Building pipelines for {} PIs...". + format(len(protocol_interfaces))) + for proto_iface in protocol_interfaces: + try: + this_protocol_pipelines = \ + proto_iface.protomap.mappings[protocol] + except KeyError: + _LOGGER.debug("Protocol '%s' lacks a mapping", protocol) + continue + + # TODO: update once dependency-encoding logic is in place. + _LOGGER.debug("Protocol: {}".format(protocol)) + pipeline_keys = this_protocol_pipelines.replace(";", ",")\ + .strip(" ()\n")\ + .split(",") + pipeline_keys = [pk.strip() for pk in pipeline_keys] + already_mapped, new_scripts = \ + partition(pipeline_keys, + partial(_is_member, items=pipeline_keys_used)) + pipeline_keys_used |= set(pipeline_keys) + + # Attempt to validate that partition yielded disjoint subsets. + try: + disjoint_partition_violation = \ + set(already_mapped) & set(new_scripts) + except TypeError: + _LOGGER.debug("Cannot validate partition") + else: + assert not disjoint_partition_violation, \ + "Partitioning {} with membership in {} as " \ + "predicate produced intersection: {}".format( + pipeline_keys, pipeline_keys_used, + disjoint_partition_violation) + + _LOGGER.debug("Skipping {} already-mapped script names: {}". + format(len(already_mapped), + ", ".join(already_mapped))) + _LOGGER.debug("{} new scripts for protocol {} from " + "pipelines warehouse '{}': {}". + format(len(new_scripts), protocol, + proto_iface.pipedir, ", ".join(new_scripts))) + + jobs.append([(proto_iface.interface, ) + + proto_iface.pipeline_key_to_path(pipeline_key) + for pipeline_key in pipeline_keys]) + + return jobs[0] if priority and len(jobs) > 1 else \ + list(itertools.chain(*jobs)) + + + def create_base_samples(self): + pass + + + def create_samples(self, pipeline=None): + """ + Create Samples for this Project, for a particular pipeline if given. + + If Sample(s) already exist for the given pipeline, nothing is done. + This ensures creation work isn't duplicated but assumes that samples + are not added to a project after it's constructed, or at least not + for a type for which Sample objects have already been created. + + :param str pipeline: name of pipeline for which to create Samples + """ + if self.merge_table is None: + try: + if _os.path.isfile(self.metadata.merge_table): + self.merge_table = _pd.read_table( + self.metadata.merge_table, + sep=None, engine="python") + else: + _LOGGER.debug("Alleged merge_table file does not exist: " + "'%s'", self.metadata.merge_table) + except AttributeError: + _LOGGER.debug("No merge table") + if pipeline in self._samples_by_pipeline: + _LOGGER.debug("Sample(s) already exist for pipeline '%s'", + pipeline) return - message = "'{}' lacks environment attributes: {}".\ - format(env_settings_file, missing_env_attrs) - if when_missing is None: - _LOGGER.warn(message) - else: - when_missing(message) + def finalize_pipelines_directory(self, pipe_path=""): """ Finalize the establishment of a path to this project's pipelines. - - With the passed argument, override anything already set. 
- Otherwise, prefer path provided in this project's config, then + + With the passed argument, override anything already set. + Otherwise, prefer path provided in this project's config, then local pipelines folder, then a location set in project environment. :param str pipe_path: (absolute) path to pipelines @@ -585,6 +733,64 @@ def finalize_pipelines_directory(self, pipe_path=""): self.metadata.pipelines_dir = pipe_path + def get_arg_string(self, pipeline_name): + """ + For this project, given a pipeline, return an argument string + specified in the project config file. + """ + + def make_optarg_text(opt, arg): + """ Transform flag/option into CLI-ready text version. """ + return "{} {}".format(opt, _os.path.expandvars(arg)) \ + if arg else opt + + def create_argtext(name): + """ Create command-line argstring text from config section. """ + try: + optargs = getattr(self.pipeline_args, name) + except AttributeError: + return "" + # NS using __dict__ will add in the metadata from AttrDict (doh!) + _LOGGER.debug("optargs.items(): {}".format(optargs.items())) + optargs_texts = [make_optarg_text(opt, arg) + for opt, arg in optargs.items()] + _LOGGER.debug("optargs_texts: {}".format(optargs_texts)) + # TODO: may need to fix some spacing issues here. + return " ".join(optargs_texts) + + default_argtext = create_argtext(DEFAULT_COMPUTE_RESOURCES_NAME) + pipeline_argtext = create_argtext(pipeline_name) + + if not pipeline_argtext: + # The project config may not have an entry for this pipeline; + # no problem! There are no pipeline-specific args. Return text + # from default arguments, whether empty or not. + return default_argtext + elif default_argtext: + # Non-empty pipeline-specific and default argtext + return " ".join([default_argtext, pipeline_argtext]) + else: + # No default argtext, but non-empty pipeline-specific argtext + return pipeline_argtext + + + def make_project_dirs(self): + """ + Creates project directory structure if it doesn't exist. + """ + for folder_name in self.project_folders: + folder_path = self.metadata[folder_name] + _LOGGER.debug("Ensuring project dir exists: '%s'", folder_path) + if not _os.path.exists(folder_path): + _LOGGER.debug("Attempting to create project folder: '%s'", + folder_path) + try: + _os.makedirs(folder_path) + except OSError as e: + _LOGGER.warn("Could not create project folder: '%s'", + str(e)) + + def parse_config_file(self, subproject=None): """ Parse provided yaml config file and check required fields exist. @@ -724,92 +930,13 @@ def parse_config_file(self, subproject=None): path_config_file=self.config_file) - def _ensure_absolute(self, maybe_relpath): - _LOGGER.debug("Ensuring absolute: '%s'", maybe_relpath) - if _os.path.isabs(maybe_relpath) or is_url(maybe_relpath): - _LOGGER.debug("Already absolute") - return maybe_relpath - # Maybe we have env vars that make the path absolute? - expanded = _os.path.expandvars(maybe_relpath) - _LOGGER.debug("Expanded: '%s'", expanded) - if _os.path.isabs(expanded): - _LOGGER.debug("Expanded is absolute") - return expanded - _LOGGER.debug("Making non-absolute path '%s' be absolute", - maybe_relpath) - # Set path to an absolute path, relative to project config. - config_dirpath = _os.path.dirname(self.config_file) - _LOGGER.debug("config_dirpath: %s", config_dirpath) - abs_path = _os.path.join(config_dirpath, maybe_relpath) - return abs_path + def pipelines_by_sample(self): + pass - def update_environment(self, env_settings_file): - """ - Parse data from environment configuration file. 
- - :param str env_settings_file: path to file with - new environment configuration data - """ - if not env_settings_file: - return - - with open(env_settings_file, 'r') as handle: - _LOGGER.info("Loading %s: %s", - self.compute_env_var, env_settings_file) - env_settings = yaml.load(handle) - _LOGGER.debug("Parsed environment settings: %s", - str(env_settings)) - - # Any compute.submission_template variables should be made - # absolute, relative to current environment settings file. - y = env_settings["compute"] - for key, value in y.items(): - if type(y[key]) is dict: - for key2, value2 in y[key].items(): - if key2 == "submission_template": - if not _os.path.isabs(y[key][key2]): - y[key][key2] = _os.path.join( - _os.path.dirname(env_settings_file), - y[key][key2]) - - env_settings["compute"] = y - if self.environment is None: - self.environment = AttributeDict(env_settings) - else: - self.environment.add_entries(env_settings) - - self.environment_file = env_settings_file - - - def make_project_dirs(self): - """ - Creates project directory structure if it doesn't exist. - """ - for folder_name in self.project_folders: - folder_path = self.metadata[folder_name] - _LOGGER.debug("Ensuring project dir exists: '%s'", folder_path) - if not _os.path.exists(folder_path): - _LOGGER.debug("Attempting to create project folder: '%s'", - folder_path) - try: - _os.makedirs(folder_path) - except OSError as e: - _LOGGER.warn("Could not create project folder: '%s'", - str(e)) - + def samples_by_pipeline(self): + pass - def set_project_permissions(self): - """ - Makes the project's public_html folder executable. - """ - for d in [self.trackhubs.trackhub_dir]: - try: - _os.chmod(d, 0o0755) - except OSError: - # This currently does not fail now - # ("cannot change folder's mode: %s" % d) - continue def set_compute(self, setting): @@ -854,45 +981,52 @@ def set_compute(self, setting): return False - def get_arg_string(self, pipeline_name): + def set_project_permissions(self): """ - For this project, given a pipeline, return an argument string - specified in the project config file. + Make the project's public_html folder executable. """ + try: + _os.chmod(self.trackhubs.trackhub_dir, 0o0755) + except OSError: + # This currently does not fail now + # ("cannot change folder's mode: %s" % d) + pass - def make_optarg_text(opt, arg): - """ Transform flag/option into CLI-ready text version. """ - return "{} {}".format(opt, _os.path.expandvars(arg)) \ - if arg else opt - def create_argtext(name): - """ Create command-line argstring text from config section. """ - try: - optargs = getattr(self.pipeline_args, name) - except AttributeError: - return "" - # NS using __dict__ will add in the metadata from AttrDict (doh!) - _LOGGER.debug("optargs.items(): {}".format(optargs.items())) - optargs_texts = [make_optarg_text(opt, arg) - for opt, arg in optargs.items()] - _LOGGER.debug("optargs_texts: {}".format(optargs_texts)) - # TODO: may need to fix some spacing issues here. - return " ".join(optargs_texts) + def update_environment(self, env_settings_file): + """ + Parse data from environment configuration file. - default_argtext, pipeline_argtext = \ - create_argtext(DEFAULT_COMPUTE_RESOURCES_NAME), create_argtext(pipeline_name) + :param str env_settings_file: path to file with + new environment configuration data + """ - if not pipeline_argtext: - # The project config may not have an entry for this pipeline; - # no problem! There are no pipeline-specific args. Return text - # from default arguments, whether empty or not. 
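For orientation on the get_arg_string relocation in this hunk, a small sketch of the flag/option assembly it performs via make_optarg_text and create_argtext; the option names and values below are hypothetical, not taken from any real project config:

import os

def make_optarg_text(opt, arg):
    # Expand environment variables in the value, or emit a bare flag when there is no value.
    return "{} {}".format(opt, os.path.expandvars(arg)) if arg else opt

optargs = {"--genome": "hg38", "--single-end": ""}   # hypothetical pipeline_args entries
argtext = " ".join(make_optarg_text(opt, arg) for opt, arg in optargs.items())
# e.g. "--genome hg38 --single-end"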
- return default_argtext - elif default_argtext: - # Non-empty pipeline-specific and default argtext - return " ".join([default_argtext, pipeline_argtext]) - else: - # No default argtext, but non-empty pipeline-specific argtext - return pipeline_argtext + with open(env_settings_file or self.default_compute_envfile, 'r') as f: + _LOGGER.info("Loading %s: %s", + self.compute_env_var, env_settings_file) + env_settings = yaml.load(f) + _LOGGER.debug("Parsed environment settings: %s", + str(env_settings)) + + # Any compute.submission_template variables should be made + # absolute, relative to current environment settings file. + y = env_settings["compute"] + for key, value in y.items(): + if type(y[key]) is dict: + for key2, value2 in y[key].items(): + if key2 == "submission_template": + if not _os.path.isabs(y[key][key2]): + y[key][key2] = _os.path.join( + _os.path.dirname(env_settings_file), + y[key][key2]) + + env_settings["compute"] = y + if self.environment is None: + self.environment = AttributeDict(env_settings) + else: + self.environment.add_entries(env_settings) + + self.environment_file = env_settings_file def add_sample_sheet(self, csv=None, sample_builder=None): @@ -902,6 +1036,8 @@ def add_sample_sheet(self, csv=None, sample_builder=None): :param csv: Path to csv file. :type csv: str + :param sample_builder: how to create single Sample from raw input data. + :type sample_builder: function(pandas.Series | dict) -> Sample """ _LOGGER.debug("Adding sample sheet") @@ -917,11 +1053,13 @@ def add_sample_sheet(self, csv=None, sample_builder=None): _LOGGER.debug("Creating samples from annotation sheet") self.sheet.make_samples(sample_builder) - # Add samples to Project + # Add samples to Project. for sample in self.sheet.samples: # Overwritten later if merged sample.merged = False - self.add_sample(sample) # Appends sample to self.samples. + # Tie sample and project bilaterally + sample.prj = self + self.samples.append(sample) # Merge sample files (!) using merge table if provided: if hasattr(self.metadata, "merge_table"): @@ -1027,19 +1165,62 @@ def add_sample_sheet(self, csv=None, sample_builder=None): "data path assignment", sample.sample_name) - def add_sample(self, sample): - """ - Adds a sample to the project's `samples`. - """ - # Check sample is Sample object - if not isinstance(sample, Sample): - raise TypeError("Provided object is not a Sample object.") + def _ensure_absolute(self, maybe_relpath): + """ Ensure that a possibly relative path is absolute. """ + _LOGGER.debug("Ensuring absolute: '%s'", maybe_relpath) + if _os.path.isabs(maybe_relpath) or is_url(maybe_relpath): + _LOGGER.debug("Already absolute") + return maybe_relpath + # Maybe we have env vars that make the path absolute? + expanded = _os.path.expandvars(maybe_relpath) + _LOGGER.debug("Expanded: '%s'", expanded) + if _os.path.isabs(expanded): + _LOGGER.debug("Expanded is absolute") + return expanded + _LOGGER.debug("Making non-absolute path '%s' be absolute", + maybe_relpath) + # Set path to an absolute path, relative to project config. + config_dirpath = _os.path.dirname(self.config_file) + _LOGGER.debug("config_dirpath: %s", config_dirpath) + abs_path = _os.path.join(config_dirpath, maybe_relpath) + return abs_path + + + def _handle_missing_env_attrs(self, env_settings_file, when_missing): + """ Default environment settings aren't required; warn, though. 
""" + missing_env_attrs = \ + [attr for attr in ["environment", "environment_file"] + if not hasattr(self, attr) or getattr(self, attr) is None] + if not missing_env_attrs: + return + message = "'{}' lacks environment attributes: {}".\ + format(env_settings_file, missing_env_attrs) + if when_missing is None: + _LOGGER.warn(message) + else: + when_missing(message) + + - # Tie sample and project bilaterally - sample.prj = self - # Append - self.samples.append(sample) +def check_sheet(sample_file, dtype=str): + """ + Check if csv file exists and has all required columns. + + :param str sample_file: path to sample annotations file. + :param type dtype: data type for CSV read. + :raises IOError: if given annotations file can't be read. + :raises ValueError: if required column(s) is/are missing. + """ + df = _pd.read_table(sample_file, sep=None, dtype=dtype, + index_col=False, engine="python") + req = [SAMPLE_NAME_COLNAME] + missing = set(req) - set(df.columns) + if len(missing) != 0: + raise ValueError( + "Annotation sheet ('{}') is missing column(s): {}; has: {}". + format(sample_file, missing, df.columns)) + return df @copy @@ -1075,28 +1256,6 @@ def __repr__(self): return "SampleSheet with %i samples." % len(self.df) - @staticmethod - def check_sheet(sample_file, dtype): - """ - Check if csv file exists and has all required columns. - - :param str sample_file: path to sample annotations file. - :param type dtype: data type for CSV read. - :raises IOError: if given annotations file can't be read. - :raises ValueError: if required column(s) is/are missing. - """ - - df = _pd.read_table(sample_file, sep=None, dtype=dtype, - index_col=False, engine="python") - req = [SAMPLE_NAME_COLNAME] - missing = set(req) - set(df.columns) - if len(missing) != 0: - raise ValueError( - "Annotation sheet ('{}') is missing column(s): {}; has: {}". - format(sample_file, missing, df.columns)) - return df - - @staticmethod def alpha_cased(text, lower=False): """ @@ -1180,10 +1339,17 @@ def make_sample(data): for sample_type, sample_class in sample_types} def make_sample(data): + # Create the most specific Sample type possible. + try: + protocol = data.library + except AttributeError: + _LOGGER.debug("Sample data lacks 'library' attribute") + return Sample(data) try: - return pairing[self.alpha_cased(data.library)](data) - except (AttributeError, KeyError): - _LOGGER.debug("Error making child subclass") + return pairing[self.alpha_cased(protocol)](data) + except KeyError: + _LOGGER.debug("Unknown protocol: '{}'; known: {}". + format(protocol, pairing.keys())) return Sample(data) return make_sample @@ -2107,129 +2273,30 @@ def _select_pipeline(self, pipeline_name): -@copy -class InterfaceManager(object): - """ Manage pipeline use for multiple locations and protocols. - - This is done by aggregating protocol interface instances, - allowing one Project to use pipelines from multiple locations. - - :param pipeline_dirs: locations containing pipelines and configuration - information; specifically, a directory with a 'pipelines' folder and - a 'config' folder, within which there is a pipeline interface file - and a protocol mappings file. - :type pipeline_dirs: Iterable[str] - - """ - def __init__(self, pipeline_dirs): - # Collect interface/mappings pairs by protocol name. 
- interfaces_and_protocols = \ - [ProtocolInterfaces(pipedir) for pipedir in pipeline_dirs - if _os.path.exists(pipedir)] - self.ifproto_by_proto_name = defaultdict(list) - for ifproto in interfaces_and_protocols: - for proto_name in ifproto.protomap: - _LOGGER.debug("Protocol name: {}".format(proto_name)) - self.ifproto_by_proto_name[proto_name].append(ifproto) - - - def build_pipelines(self, protocol_name, priority=True): - """ - Build up a sequence of scripts to execute for this protocol. - - :param str protocol_name: name for the protocol for which to build - pipelines - :param bool priority: should only the top priority mapping be used? - :return Sequence[(PipelineInterface, str, str)]: sequence of jobs - (script paths) to execute for the given protocol; if priority - flag is set (as is the default), this is a single-element list, - the sequence of jobs built is interpreted as descending priority - """ - - try: - ifprotos = self.ifproto_by_proto_name[protocol_name] - except KeyError: - _LOGGER.warn("Unknown protocol: '{}'".format(protocol_name)) - return [] - - jobs = [] - pipeline_keys_used = set() - _LOGGER.debug("Building pipelines for {} PIs...".format(len(ifprotos))) - for ifproto in ifprotos: - try: - this_protocol_pipelines = \ - ifproto.protomap.mappings[protocol_name] - except KeyError: - _LOGGER.debug("Protocol {} missing mapping in '{}'". - format(protocol_name, ifproto.protomaps_path)) - else: - # TODO: update once dependency-encoding logic is in place. - _LOGGER.debug("Protocol: {}".format(protocol_name)) - pipeline_keys = this_protocol_pipelines.replace(";", ",")\ - .strip(" ()\n")\ - .split(",") - pipeline_keys = [pk.strip() for pk in pipeline_keys] - already_mapped, new_scripts = \ - partition(pipeline_keys, - partial(_is_member, items=pipeline_keys_used)) - pipeline_keys_used |= set(pipeline_keys) - - if len(pipeline_keys) != (len(already_mapped) + len(new_scripts)): - _LOGGER.error("{} --> {} + {}".format( - pipeline_keys, already_mapped, new_scripts)) - - raise RuntimeError( - "Partitioned {} script names into allegedly " - "disjoint sets of {} and {} elements.". - format(len(pipeline_keys), - len(already_mapped), - len(new_scripts))) - - _LOGGER.debug("Skipping {} already-mapped script names: {}". - format(len(already_mapped), - ", ".join(already_mapped))) - _LOGGER.debug("{} new scripts for protocol {} from " - "pipelines warehouse '{}': {}". - format(len(new_scripts), protocol_name, - ifproto.pipedir, ", ".join(new_scripts))) - - jobs.append([(ifproto.interface, ) + - ifproto.pipeline_key_to_path(pipeline_key) - for pipeline_key in pipeline_keys]) - - return jobs[0] if priority and len(jobs) > 1 else list(itertools.chain(*jobs)) - - - -def _is_member(item, items): - return item in items - - - -# TODO: rename. -class ProtocolInterfaces: +class ProtocolInterface(object): """ PipelineInterface and ProtocolMapper for a single pipelines location. - Instances of this class are used by InterfaceManager to facilitate - multi-location pipelines use by a single project. Here also are stored - path attributes to retain information about the location from which the - interface and mapper came. + This class facilitates use of pipelines from multiple locations by a + single project. Also stored are path attributes with information about + the location(s) from which the PipelineInterface and ProtocolMapper came. 
:param pipedir: location (e.g., code repository) of pipelines :type pipedir: str """ def __init__(self, pipedir): + + super(ProtocolInterface, self).__init__() + if _os.path.isdir(pipedir): self.pipedir = pipedir self.config_path = _os.path.join(pipedir, "config") - self.interface_path = _os.path.join(self.config_path, - "pipeline_interface.yaml") - self.protomaps_path = _os.path.join(self.config_path, - "protocol_mappings.yaml") - self.interface = PipelineInterface(self.interface_path) - self.protomap = ProtocolMapper(self.protomaps_path) + self.interface = PipelineInterface(_os.path.join( + self.config_path, "pipeline_interface.yaml")) + self.protomap = ProtocolMapper(_os.path.join( + self.config_path, "protocol_mappings.yaml")) self.pipelines_path = _os.path.join(pipedir, "pipelines") + elif _os.path.isfile(pipedir): # Secondary version that passes combined yaml file directly, # instead of relying on separate hard-coded config names as above @@ -2254,6 +2321,7 @@ def __init__(self, pipedir): except Exception as e: _LOGGER.error(str(iface)) raise e + else: raise ValueError("Alleged pipelines location '{}' exists neither " "as a file nor as a folder.".format(pipedir)) @@ -2270,8 +2338,9 @@ def pipeline_key_to_path(self, pipeline_key): absolute path for pipeline script. """ - # key may contain extra command-line flags; split key from flags. + # The key may contain extra command-line flags; split key from flags. + # The strict key is the script name itself, something like "ATACseq.py" strict_pipeline_key, _, pipeline_key_args = pipeline_key.partition(' ') if self.interface.get_attribute(strict_pipeline_key, "path"): @@ -2309,13 +2378,10 @@ class ProtocolMapper(Mapping): """ def __init__(self, mappings_input): if isinstance(mappings_input, Mapping): - # Pre-parsed mappings data - self.mappings_file = None mappings = mappings_input else: # Parse file mapping protocols to pipeline(s). - self.mappings_file = mappings_input - with open(self.mappings_file, 'r') as mapfile: + with open(mappings_input, 'r') as mapfile: mappings = yaml.load(mapfile) self.mappings = {k.upper(): v for k, v in mappings.items()} @@ -2348,7 +2414,6 @@ def build_pipeline(self, protocol): self.parse_parallel_jobs(split_jobs[i], split_jobs[i - 1]) """ - # TODO: incorporate into the InterfaceManager? def parse_parallel_jobs(self, job, dep): job = job.replace("(", "").replace(")", "") split_jobs = [x.strip() for x in job.split(',')] @@ -2358,13 +2423,12 @@ def parse_parallel_jobs(self, job, dep): else: self.register_job(job, dep) - # TODO: incorporate into InterfaceManager? def register_job(self, job, dep): _LOGGER.info("Register Job Name: %s\tDep: %s", str(job), str(dep)) - def __getitem__(self, item): - return self.mappings[item] + def __getitem__(self, protocol_name): + return self.mappings[protocol_name] def __iter__(self): return iter(self.mappings) @@ -2422,3 +2486,8 @@ class _MissingPipelineConfigurationException(Exception): """ A selected pipeline needs configuration data. 
""" def __init__(self, pipeline): super(_MissingPipelineConfigurationException, self).__init__(pipeline) + + + +def _is_member(item, items): + return item in items From e3583839fba948ad3776352b0541e7052e77ad63 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 17 Jun 2017 02:17:03 -0400 Subject: [PATCH 21/94] more work on Sample subclass support --- looper/looper.py | 36 ++++++++++++++++-------------------- looper/models.py | 23 ++++++++++++++--------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 2bc32985..2353d642 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -183,12 +183,7 @@ def run(prj, args, remaining_args): to be passed on to parser(s) elsewhere """ - # Easier change later, especially likely for library --> protocol. - _read_type = "read_type" - _protocol = "library" - - _start_counter(len(prj.samples())) - + _start_counter(len(prj.samples)) valid_read_types = ["single", "paired"] # Keep track of how many jobs have been submitted. @@ -199,7 +194,7 @@ def run(prj, args, remaining_args): # Create a problem list so we can keep track and show them at the end. failures = [] - for sample in prj.samples(): + for sample in prj.samples: _LOGGER.debug(sample) _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) @@ -219,24 +214,25 @@ def run(prj, args, remaining_args): format(SAMPLE_EXECUTION_TOGGLE)) # Check if single_or_paired value is recognized. - if hasattr(sample, _read_type): + if hasattr(sample, "read_type"): # Drop "-end", "_end", or just "end" from end of the column value. sample.read_type = re.sub( '[_\\-]?end$', '', str(sample.read_type)).lower() if sample.read_type not in valid_read_types: skip_reasons.append("{} must be in {}".\ - format(_read_type, valid_read_types)) + format("read_type", valid_read_types)) # Get the base protocol-to-pipeline mappings - if hasattr(sample, _protocol): - protocol = sample.library.upper() + try: + protocol = sample.library + except AttributeError: + skip_reasons.append("Missing 'library' attribute") + else: + protocol = protocol.upper() pipelines = prj.build_pipelines(protocol) if len(pipelines) == 0: skip_reasons.append( "No pipeline found for protocol {}".format(protocol)) - else: - skip_reasons.append("Missing '{}' attribute".format(_protocol)) - if skip_reasons: _LOGGER.warn("> Not submitted: {}".format(skip_reasons)) @@ -418,9 +414,9 @@ def summarize(prj): columns = [] stats = [] - _start_counter(len(prj.samples())) + _start_counter(len(prj.samples)) - for sample in prj.samples(): + for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) @@ -480,9 +476,9 @@ def destroy(prj, args, preview_flag=True): _LOGGER.info("Results to destroy:") - _start_counter(len(prj.samples())) + _start_counter(len(prj.samples)) - for sample in prj.samples(): + for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) @@ -517,9 +513,9 @@ def clean(prj, args, preview_flag=True): _LOGGER.info("Files to clean:") - _start_counter(len(prj.samples())) + _start_counter(len(prj.samples)) - for sample in prj.samples(): + for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) diff --git a/looper/models.py b/looper/models.py index 
8241d28d..c212c2a6 100644 --- a/looper/models.py +++ b/looper/models.py @@ -534,12 +534,13 @@ def sample_names(self): return iter(self.sheet[SAMPLE_NAME_COLNAME]) - @property def samples(self): - if self._samples is None: - self.create_base_samples() - return self._samples + # TODO: account for merge table; store or re-merge every time? + # TODO: is it more likely to have a bunch of samples, or that + # TODO: use of this and thus the need to re-merge is very frequent? + for _, row in self.sheet.df.iterrows(): + yield Sample(row.dropna()) def samples_for(self, pipeline=None): @@ -600,7 +601,15 @@ def build_pipelines(self, protocol, priority=True): :param bool priority: :return: """ - + + # TODO: called from looper.run; do the import and subclass search here + # TODO: for the search, use something like subprocess with grep for + # TODO: checking for if __name__ == __main__ to determine whether it + # TODO: may run. If so, warn and skip. If not, import with something + # TODO: like imp.load_source, then use the inspect logic to search + # TODO: for Sample subclass(es), using one without __library__ as + # TODO: presumptive default. Determine what to do about specificity. + try: protocol_interfaces = self.interfaces_by_protocol[protocol] except KeyError: @@ -659,10 +668,6 @@ def build_pipelines(self, protocol, priority=True): list(itertools.chain(*jobs)) - def create_base_samples(self): - pass - - def create_samples(self, pipeline=None): """ Create Samples for this Project, for a particular pipeline if given. From cddc40542ae0923ef634037b442c0877694a1fbd Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 17 Jun 2017 17:52:25 -0400 Subject: [PATCH 22/94] clarification of some of what's going on in the pipeline construction step --- looper/looper.py | 2 ++ looper/models.py | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 2353d642..59138f1f 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -229,6 +229,8 @@ def run(prj, args, remaining_args): skip_reasons.append("Missing 'library' attribute") else: protocol = protocol.upper() + _LOGGER.debug("Building pipeline(s) for protocol: '{}'". + format(protocol)) pipelines = prj.build_pipelines(protocol) if len(pipelines) == 0: skip_reasons.append( diff --git a/looper/models.py b/looper/models.py index c212c2a6..58b0dbe7 100644 --- a/looper/models.py +++ b/looper/models.py @@ -595,11 +595,20 @@ def infer_name(path_config_file): def build_pipelines(self, protocol, priority=True): """ + Create pipelines to submit for each sample of a particular protocol. + + With the argument (flag) to the priority parameter, there's control + over whether to submit pipeline(s) from only one of the project's + known pipeline locations with a match for the protocol, or whether to + submit pipelines created from all locations with a match for the + protocol. 
- - :param str protocol: - :param bool priority: - :return: + :param str protocol: name of the protocol/library for which to + create pipeline(s) + :param bool priority: to only submit pipeline(s) from the first of the + pipelines location(s) (indicated in the project config file) that + has a match for the given protocol; optional, default True + :return Iterable[(PipelineInterface, str, str)]: """ # TODO: called from looper.run; do the import and subclass search here @@ -610,17 +619,31 @@ def build_pipelines(self, protocol, priority=True): # TODO: for Sample subclass(es), using one without __library__ as # TODO: presumptive default. Determine what to do about specificity. + # Pull out the collection of interfaces (potentially one from each of + # the locations indicated in the project configuration file) as a + # sort of pool of information about possible ways in which to submit + # pipeline(s) for sample(s) of the indicated protocol. try: protocol_interfaces = self.interfaces_by_protocol[protocol] except KeyError: _LOGGER.warn("Unknown protocol: '{}'".format(protocol)) return [] + # Collect jobs = [] pipeline_keys_used = set() _LOGGER.debug("Building pipelines for {} PIs...". format(len(protocol_interfaces))) for proto_iface in protocol_interfaces: + # Short-circuit if we care only about the highest-priority match + # for pipeline submission. That is, if the intent is to submit + # pipeline(s) from a single location for each sample of the given + # protocol, we can stop searching the pool of pipeline interface + # information once we've found a match for the protocol. + if priority and 0 != len(jobs): + return jobs[0] + + try: this_protocol_pipelines = \ proto_iface.protomap.mappings[protocol] @@ -629,7 +652,6 @@ def build_pipelines(self, protocol, priority=True): continue # TODO: update once dependency-encoding logic is in place. - _LOGGER.debug("Protocol: {}".format(protocol)) pipeline_keys = this_protocol_pipelines.replace(";", ",")\ .strip(" ()\n")\ .split(",") @@ -660,6 +682,9 @@ def build_pipelines(self, protocol, priority=True): format(len(new_scripts), protocol, proto_iface.pipedir, ", ".join(new_scripts))) + # TODO: determine why comprehension here is over pipeline keys + # TODO: rather than over one of the disjoint subsets that results + # TODO: from the partition procedure based on mapping status. jobs.append([(proto_iface.interface, ) + proto_iface.pipeline_key_to_path(pipeline_key) for pipeline_key in pipeline_keys]) From 16fe8368b83430b5c9457cbd7b6b17252c615244 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 17 Jun 2017 19:34:31 -0400 Subject: [PATCH 23/94] more explanation --- looper/models.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/looper/models.py b/looper/models.py index 58b0dbe7..37bca409 100644 --- a/looper/models.py +++ b/looper/models.py @@ -652,9 +652,16 @@ def build_pipelines(self, protocol, priority=True): continue # TODO: update once dependency-encoding logic is in place. - pipeline_keys = this_protocol_pipelines.replace(";", ",")\ - .strip(" ()\n")\ - .split(",") + # The proposed dependency-encoding format uses a semicolon + # between pipelines for which the dependency relationship is + # serial. For now, simply treat those as multiple independent + # pipelines by replacing the semicolon with a comma, which is the + # way in which multiple independent pipelines for a single protocol + # are represented in the mapping declaration. 
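As a concrete illustration of the treatment described just above, a hypothetical protocol-mapping value and the keys that the cleanup below derives from it:

this_protocol_pipelines = "(trim.py; align.py, peaks.py)"   # hypothetical mapping value
keys = this_protocol_pipelines.replace(";", ",").strip(" ()\n").split(",")
keys = [pk.strip() for pk in keys]
# -> ["trim.py", "align.py", "peaks.py"]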
+ pipeline_keys = \ + this_protocol_pipelines.replace(";", ",")\ + .strip(" ()\n")\ + .split(",") pipeline_keys = [pk.strip() for pk in pipeline_keys] already_mapped, new_scripts = \ partition(pipeline_keys, @@ -666,7 +673,7 @@ def build_pipelines(self, protocol, priority=True): disjoint_partition_violation = \ set(already_mapped) & set(new_scripts) except TypeError: - _LOGGER.debug("Cannot validate partition") + _LOGGER.debug("Unable to hash partitions for validation") else: assert not disjoint_partition_violation, \ "Partitioning {} with membership in {} as " \ From b60cc834986fd81a6e3933a62e44e471dff5227c Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 17 Jun 2017 21:43:01 -0400 Subject: [PATCH 24/94] first pass at finished build_pipelines implementation --- looper/models.py | 104 +++++++++++++++++++-- looper/utils.py | 239 +++++++++++++++++++++++++++-------------------- 2 files changed, 233 insertions(+), 110 deletions(-) diff --git a/looper/models.py b/looper/models.py index 37bca409..b745e61a 100644 --- a/looper/models.py +++ b/looper/models.py @@ -65,7 +65,8 @@ import yaml from .utils import \ - parse_ftype, check_bam, check_fastq, get_file_size, partition + alpha_cased, check_bam, check_fastq, get_file_size, \ + import_from_source, parse_ftype, partition COMPUTE_SETTINGS_VARNAME = "PEPENV" @@ -609,6 +610,9 @@ def build_pipelines(self, protocol, priority=True): pipelines location(s) (indicated in the project config file) that has a match for the given protocol; optional, default True :return Iterable[(PipelineInterface, str, str)]: + :raises AssertionError: if there's a failure in the attempt to + partition an interface's pipeline scripts into disjoint subsets of + those already mapped and those not yet mapped """ # TODO: called from looper.run; do the import and subclass search here @@ -640,10 +644,9 @@ def build_pipelines(self, protocol, priority=True): # pipeline(s) from a single location for each sample of the given # protocol, we can stop searching the pool of pipeline interface # information once we've found a match for the protocol. - if priority and 0 != len(jobs): + if priority and len(jobs) > 0: return jobs[0] - try: this_protocol_pipelines = \ proto_iface.protomap.mappings[protocol] @@ -662,7 +665,12 @@ def build_pipelines(self, protocol, priority=True): this_protocol_pipelines.replace(";", ",")\ .strip(" ()\n")\ .split(",") + # These cleaned pipeline keys are what's used to resolve the path + # to the pipeline to run. Essentially, each pipeline key is a + # pointer to the fully-qualified location of a pipeline pipeline_keys = [pk.strip() for pk in pipeline_keys] + + # Skip over pipelines already been mapped by another location. already_mapped, new_scripts = \ partition(pipeline_keys, partial(_is_member, items=pipeline_keys_used)) @@ -689,13 +697,12 @@ def build_pipelines(self, protocol, priority=True): format(len(new_scripts), protocol, proto_iface.pipedir, ", ".join(new_scripts))) - # TODO: determine why comprehension here is over pipeline keys - # TODO: rather than over one of the disjoint subsets that results - # TODO: from the partition procedure based on mapping status. jobs.append([(proto_iface.interface, ) + proto_iface.pipeline_key_to_path(pipeline_key) - for pipeline_key in pipeline_keys]) + for pipeline_key in new_scripts]) + # Repeat logic check of short-circuit conditional to account for + # edge case in which it's satisfied during the final iteration. 
return jobs[0] if priority and len(jobs) > 1 else \ list(itertools.chain(*jobs)) @@ -2055,6 +2062,89 @@ def set_read_type(self, n=10, permissive=True): feature, self.name) + @classmethod + def select_sample_subtype(cls, pipeline_filepath, protocol=None): + """ + From a pipeline module, select Sample subtype for a particular protocol. + + The indicated file needs to be a Python module that can be imported. + Critically, it must be written such that importing it does not run it + as a script. That is, its workflow logic should be bundled into + function(s), or at least nested under a "if __name__ == '__main__'" + conditional. + + :param str pipeline_filepath: path to file defining a pipeline + :param str protocol: name of protocol for which to select Sample subtype + :return type: Sample type most tailored to indicated protocol and defined + within the module indicated by the given filepath, optional; if + unspecified, or if the indicated file cannot be imported, then the + base Sample type is returned. Critically, the indicated + """ + + if not _os.path.isfile(pipeline_filepath): + _LOGGER.debug("Alleged pipeline module path is not a file: '%s'", + pipeline_filepath) + return cls + + # Determine whether it appears safe to import the pipeline module, + # and return a generic, base Sample if not. + import subprocess + def file_has_pattern(pattern, filepath): + try: + with open(_os.devnull, 'w') as devnull: + return subprocess.call( + ["grep", pattern, filepath], stdout=devnull) + except Exception: + return False + safety_lines = ["if __name__ == '__main__'", + "if __name__ == \"__main__\""] + safe_to_import = \ + any(map(partial(file_has_pattern, + filepath=pipeline_filepath), + safety_lines)) + if not safe_to_import: + _LOGGER.debug("Attempt to import '{}' may run code so is refused.". + format(pipeline_filepath)) + return cls + + # Import pipeline module and find Sample subtypes. + _, modname = _os.path.split(pipeline_filepath) + modname, _ = _os.path.splitext(modname) + pipeline_module = import_from_source( + name=modname, module_filepath=pipeline_filepath) + _LOGGER.debug("Successfully imported pipeline module '%s', " + "naming it '%s'", pipeline_filepath, + pipeline_module.__name__) + import inspect + sample_subtypes = inspect.getmembers( + pipeline_module, lambda obj: isinstance(obj, Sample)) + _LOGGER.debug("%d sample subtype(s): %s", len(sample_subtypes), + ", ".join([subtype.__name__ + for subtype in sample_subtypes])) + + # Attempt to match protocol to subtype. 
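The matching below keys on letters-only, case-homogenized protocol names. A self-contained sketch of that normalization, mirroring utils.alpha_cased, with hypothetical protocol spellings:

def alpha_cased_sketch(text, lower=False):
    # Keep only letters and homogenize case, as utils.alpha_cased does.
    text = "".join(filter(lambda c: c.isalpha(), text))
    return text.lower() if lower else text.upper()

# Different spellings of the same protocol normalize to a single key.
assert alpha_cased_sketch("RNA-seq") == alpha_cased_sketch("rnaseq") == "RNASEQ"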
+ protocol_key = alpha_cased(protocol) + matched_subtypes = [subtype for subtype in sample_subtypes if + protocol_key == alpha_cased(subtype.__library__)] + subtype_by_protocol_text = \ + ", ".join(["'{}' ({})".format(subtype.__library, subtype) + for subtype in sample_subtypes]) + if 0 == len(matched_subtypes): + _LOGGER.debug("No known Sample subtype for protocol '%s'; " + "known: %s", protocol, subtype_by_protocol_text) + return Sample + elif 1 == len(matched_subtypes): + subtype = matched_subtypes[0] + _LOGGER.info("Matched protocol '%s' to Sample subtype %s", + protocol, subtype.__name__) + return subtype + else: + _LOGGER.debug("Unable to choose from %d Sample subtype matches " + "for protocol '%s': %s", len(matched_subtypes), + protocol, subtype_by_protocol_text) + return Sample + + @copy class PipelineInterface(object): diff --git a/looper/utils.py b/looper/utils.py index e65eb841..4fe3c241 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -21,18 +21,71 @@ def format_help(self): +def alpha_cased(text, lower=False): + """ + Filter text to just letters and homogenize case. + + :param str text: what to filter and homogenize. + :param bool lower: whether to convert to lowercase; default uppercase. + :return str: input filtered to just letters, with homogenized case. + """ + text = "".join(filter(lambda c: c.isalpha(), text)) + return text.lower() if lower else text.upper() + + + +def check_bam(bam, o): + """ + Check reads in BAM file for read type and lengths. + + :param str bam: BAM file path. + :param int o: Number of reads to look at for estimation. + """ + try: + p = sp.Popen(['samtools', 'view', bam], stdout=sp.PIPE) + # Count paired alignments + paired = 0 + read_length = Counter() + while o > 0: # Count down number of lines + line = p.stdout.readline().decode().split("\t") + flag = int(line[1]) + read_length[len(line[9])] += 1 + if 1 & flag: # check decimal flag contains 1 (paired) + paired += 1 + o -= 1 + p.kill() + except OSError: + reason = "Note (samtools not in path): For NGS inputs, " \ + "looper needs samtools to auto-populate " \ + "'read_length' and 'read_type' attributes; " \ + "these attributes were not populated." + raise OSError(reason) + + _LOGGER.debug("Read lengths: {}".format(read_length)) + _LOGGER.debug("paired: {}".format(paired)) + return read_length, paired + + + +def check_fastq(fastq, o): + raise NotImplementedError("Detection of read type/length for " + "fastq input is not yet implemented.") + + + def fetch_package_classes(pkg, predicate=None): """ Enable single-depth fetch of package's classes if not exported. - + :param module pkg: the package of interest. - :param function(type) -> bool predicate: condition each class must + :param function(type) -> bool predicate: condition each class must satisfy in order to be returned. - :return Iterable(type): classes one layer deep within the package, that + :return Iterable(type): classes one layer deep within the package, that satisfy the condition if given. """ import inspect import itertools + modules = [pkg] if inspect.ismodule(pkg) else \ [obj for obj in inspect.getmembers( pkg, lambda member: inspect.ismodule(member))] @@ -41,6 +94,79 @@ def fetch_package_classes(pkg, predicate=None): +def get_file_size(filename): + """ + Get size of all files in gigabytes (Gb). + + :param str | collections.Iterable[str] filename: A space-separated + string or list of space-separated strings of absolute file paths. + :return float: size of file(s), in gigabytes. 
+ """ + if filename is None: + return float(0) + if type(filename) is list: + return float(sum([get_file_size(x) for x in filename])) + try: + total_bytes = sum([float(os.stat(f).st_size) + for f in filename.split(" ") if f is not '']) + except OSError: + # File not found + return 0.0 + else: + return float(total_bytes) / (1024 ** 3) + + + +def import_from_source(name, module_filepath): + """ + Import a module from a particular filesystem location. + + :param str name: name for the module when loaded + :param str module_filepath: path to the file that constitutes the module + to import + :return module: module imported from the given location, named as indicated + """ + import sys + + if sys.version_info >= (3, 5): + from importlib import util as _il_util + modspec = _il_util.spec_from_file_module_filepath( + name, module_filepath) + mod = _il_util.module_from_spec(modspec) + modspec.loader.exec_module(mod) + elif sys.version_info < (3, 3): + import imp + mod = imp.load_source(name, module_filepath) + else: + # 3.3 or 3.4 + from importlib import machinery + mod = machinery.SourceFileLoader(name, module_filepath) + + return mod + + + +def parse_ftype(input_file): + """ + Checks determine filetype from extension. + + :param str input_file: String to check. + :return str: filetype (extension without dot prefix) + :raises TypeError: if file does not appear of a supported type + """ + if input_file.endswith(".bam"): + return "bam" + elif input_file.endswith(".fastq") or \ + input_file.endswith(".fq") or \ + input_file.endswith(".fq.gz") or \ + input_file.endswith(".fastq.gz"): + return "fastq" + else: + raise TypeError("Type of input file ends in neither '.bam' " + "nor '.fastq' [file: '" + input_file + "']") + + + def parse_text_data(lines_or_path, delimiter=os.linesep): """ Interpret input argument as lines of data. This is intended to support @@ -98,18 +224,6 @@ def partition(items, test): -# TODO: -# It appears that this isn't currently used. -# It could be included as a validation stage in Project instantiation. -# If Project instance being validated lacked specific relevant -# configuration section the call here would either need to be skipped, -# or this would need to pass in such a scenario. That would not be -# a challenge, but it just needs to be noted. - -# TODO: -# Test this with additional pipeline config file, -# pointed to in relevant section of project config file: -# http://looper.readthedocs.io/en/latest/define-your-project.html#project-config-section-pipeline-config class CommandChecker(object): """ Validate PATH availability of executables referenced by a config file. @@ -125,8 +239,10 @@ class CommandChecker(object): :param sections_to_skip: analogous to the check names parameter, but for specific sections to skip. :type sections_to_skip: Iterable[str] - + """ + + def __init__(self, path_conf_file, sections_to_check=None, sections_to_skip=None): @@ -143,9 +259,9 @@ def __init__(self, path_conf_file, # Determine which sections to validate. sections = {sections_to_check} if isinstance(sections_to_check, str) \ - else set(sections_to_check or conf_data.keys()) + else set(sections_to_check or conf_data.keys()) excl = {sections_to_skip} if isinstance(sections_to_skip, str) \ - else set(sections_to_skip or []) + else set(sections_to_skip or []) sections -= excl self._logger.info("Validating %d sections: %s", @@ -155,8 +271,8 @@ def __init__(self, path_conf_file, # Store per-command mapping of status, nested under section. 
self.section_to_status_by_command = defaultdict(dict) # Store only information about the failures. - self.failures_by_section = defaultdict(list) # Access by section. - self.failures = set() # Access by command. + self.failures_by_section = defaultdict(list) # Access by section. + self.failures = set() # Access by command. for s in sections: # Fetch section data or skip. @@ -245,86 +361,3 @@ def is_command_callable(command, name=""): _LOGGER.debug("Command{0}is not callable: {1}". format(alias_value, command)) return not bool(code) - - - -def parse_ftype(input_file): - """ - Checks determine filetype from extension. - - :param str input_file: String to check. - :return str: filetype (extension without dot prefix) - :raises TypeError: if file does not appear of a supported type - """ - if input_file.endswith(".bam"): - return "bam" - elif input_file.endswith(".fastq") or \ - input_file.endswith(".fq") or \ - input_file.endswith(".fq.gz") or \ - input_file.endswith(".fastq.gz"): - return "fastq" - else: - raise TypeError("Type of input file ends in neither '.bam' " - "nor '.fastq' [file: '" + input_file + "']") - - - -def check_bam(bam, o): - """ - Check reads in BAM file for read type and lengths. - - :param str bam: BAM file path. - :param int o: Number of reads to look at for estimation. - """ - try: - p = sp.Popen(['samtools', 'view', bam], stdout=sp.PIPE) - # Count paired alignments - paired = 0 - read_length = Counter() - while o > 0: # Count down number of lines - line = p.stdout.readline().decode().split("\t") - flag = int(line[1]) - read_length[len(line[9])] += 1 - if 1 & flag: # check decimal flag contains 1 (paired) - paired += 1 - o -= 1 - p.kill() - except OSError: - reason = "Note (samtools not in path): For NGS inputs, " \ - "looper needs samtools to auto-populate " \ - "'read_length' and 'read_type' attributes; " \ - "these attributes were not populated." - raise OSError(reason) - - _LOGGER.debug("Read lengths: {}".format(read_length)) - _LOGGER.debug("paired: {}".format(paired)) - return read_length, paired - - - -def check_fastq(fastq, o): - raise NotImplementedError("Detection of read type/length for " - "fastq input is not yet implemented.") - - - -def get_file_size(filename): - """ - Get size of all files in gigabytes (Gb). - - :param str | collections.Iterable[str] filename: A space-separated - string or list of space-separated strings of absolute file paths. - :return float: size of file(s), in gigabytes. - """ - if filename is None: - return float(0) - if type(filename) is list: - return float(sum([get_file_size(x) for x in filename])) - try: - total_bytes = sum([float(os.stat(f).st_size) - for f in filename.split(" ") if f is not '']) - except OSError: - # File not found - return 0.0 - else: - return float(total_bytes) / (1024 ** 3) From 3eb273955934c1ec69a21f3966d82d7b1849948f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 17 Jun 2017 23:12:02 -0400 Subject: [PATCH 25/94] creating subtype for each pipeline submission --- looper/looper.py | 3 +- looper/models.py | 96 +++++++++++++++++++++++++++++++----------------- 2 files changed, 65 insertions(+), 34 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 59138f1f..4f1c5d4a 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -248,7 +248,8 @@ def run(prj, args, remaining_args): # Go through all pipelines to submit for this protocol. # Note: control flow doesn't reach this point if variable "pipelines" # cannot be assigned (library/protocol missing). 
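The hunk below widens each per-pipeline submission entry from three elements to four. A sketch of the shape being unpacked, with placeholder values standing in for the real objects:

# Placeholder entries; real ones carry a PipelineInterface, a Sample subtype,
# the strict pipeline key, and the full command with any flags.
pipelines = [
    ("<PipelineInterface>", "<Sample subtype>", "atacseq.py",
     "/pipelines/atacseq.py --cores 4"),
]
for pipeline_interface, sample_subtype, pipeline_key, pipeline_job in pipelines:
    print(pipeline_key, "->", pipeline_job)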
- for pipeline_interface, pipeline_key, pipeline_job in pipelines: + for pipeline_interface, sample_subtype, pipeline_key, pipeline_job \ + in pipelines: # pipeline_key (previously pl_id) is no longer necessarily script name, it's more flexible. # The current sample is active. diff --git a/looper/models.py b/looper/models.py index b745e61a..fc9b7a73 100644 --- a/looper/models.py +++ b/looper/models.py @@ -49,7 +49,8 @@ # TODO: the examples changes would involve library and output_dir. from collections import \ - defaultdict, Iterable, Mapping, MutableMapping, OrderedDict as _OrderedDict + defaultdict, Iterable, Mapping, MutableMapping, namedtuple, \ + OrderedDict as _OrderedDict from functools import partial import glob import itertools @@ -328,6 +329,12 @@ def process_pipeline_interfaces(pipeline_interface_locations): +SampleSubmission = namedtuple( + "SampleSubmission", + field_names=["interface", "subtype", "pipeline", "command"]) + + + @copy class Project(AttributeDict): """ @@ -697,9 +704,18 @@ def build_pipelines(self, protocol, priority=True): format(len(new_scripts), protocol, proto_iface.pipedir, ", ".join(new_scripts))) - jobs.append([(proto_iface.interface, ) + - proto_iface.pipeline_key_to_path(pipeline_key) - for pipeline_key in new_scripts]) + new_jobs = [] + for pipeline_key in new_scripts: + strict_pipe_key, full_pipe_path, cmd = \ + proto_iface.pipeline_key_to_path(pipeline_key) + sample_subtype = Sample.select_sample_subtype( + full_pipe_path, protocol) + submission = SampleSubmission( + proto_iface.interface, sample_subtype, + strict_pipe_key, cmd) + new_jobs.append(submission) + + jobs.append(new_jobs) # Repeat logic check of short-circuit conditional to account for # edge case in which it's satisfied during the final iteration. @@ -2121,28 +2137,43 @@ def file_has_pattern(pattern, filepath): _LOGGER.debug("%d sample subtype(s): %s", len(sample_subtypes), ", ".join([subtype.__name__ for subtype in sample_subtypes])) - - # Attempt to match protocol to subtype. - protocol_key = alpha_cased(protocol) - matched_subtypes = [subtype for subtype in sample_subtypes if - protocol_key == alpha_cased(subtype.__library__)] + + # Match all subtypes for null protocol; use __library__ for non-null. + if protocol is None: + _LOGGER.debug("Null protocol, matching every subtypes...") + matched_subtypes = sample_subtypes + else: + protocol_key = alpha_cased(protocol) + matched_subtypes = \ + [subtype for subtype in sample_subtypes + if protocol_key == alpha_cased(subtype.__library__)] + + # Helpful for messages about protocol name for each subtype subtype_by_protocol_text = \ ", ".join(["'{}' ({})".format(subtype.__library, subtype) for subtype in sample_subtypes]) + + # Select subtype based on match count. if 0 == len(matched_subtypes): - _LOGGER.debug("No known Sample subtype for protocol '%s'; " - "known: %s", protocol, subtype_by_protocol_text) - return Sample + # Fall back to base Sample if we have no matches. + _LOGGER.debug( + "No known Sample subtype for protocol '{}' in '{}'; " + "known: {}".format(protocol, pipeline_filepath, + subtype_by_protocol_text)) + return cls elif 1 == len(matched_subtypes): + # Use the single match if there's exactly one. subtype = matched_subtypes[0] - _LOGGER.info("Matched protocol '%s' to Sample subtype %s", - protocol, subtype.__name__) + _LOGGER.info("Matched protocol '{}' to Sample subtype {}". 
+ format(protocol, subtype.__name__)) return subtype else: - _LOGGER.debug("Unable to choose from %d Sample subtype matches " - "for protocol '%s': %s", len(matched_subtypes), - protocol, subtype_by_protocol_text) - return Sample + # Throw up our hands and fall back to base Sample for multi-match. + _LOGGER.debug("Unable to choose from {} Sample subtype matches " + "for protocol '{}' in '{}': {}".format( + len(matched_subtypes), protocol, + pipeline_filepath, subtype_by_protocol_text)) + return cls @@ -2461,8 +2492,8 @@ def pipeline_key_to_path(self, pipeline_key): :param str pipeline_key: the key in the pipeline interface file used for the protocol_mappings section. Previously was the script name. - :return (str, str): more restrictive version of input key, along with - absolute path for pipeline script. + :return (str, str, str): more precise version of input key, along with + absolute path for pipeline script, and full script path + options """ @@ -2472,25 +2503,24 @@ def pipeline_key_to_path(self, pipeline_key): if self.interface.get_attribute(strict_pipeline_key, "path"): script_path_only = self.interface.get_attribute( - strict_pipeline_key, "path")[0] - script_path_with_flags = " ".join([script_path_only, pipeline_key_args]) + strict_pipeline_key, "path")[0].strip() + script_path_with_flags = \ + " ".join([script_path_only, pipeline_key_args]) else: # backwards compatibility w/ v0.5 script_path_only = strict_pipeline_key script_path_with_flags = pipeline_key - if _os.path.isabs(script_path_only): - if not _os.path.exists(script_path_only.strip()): - _LOGGER.warn("Missing script command: '{}'".format(script_path_only)) - return strict_pipeline_key, script_path_with_flags - else: - abs_script_path_only = _os.path.join(self.pipelines_path, script_path_only) - abs_script_path_with_flags = _os.path.join(self.pipelines_path, script_path_with_flags) + if not _os.path.isabs(script_path_only): + script_path_only = _os.path.join( + self.pipelines_path, script_path_only) + script_path_with_flags = _os.path.join( + self.pipelines_path, script_path_with_flags) + if not _os.path.exists(script_path_only): + _LOGGER.warn( + "Missing script command: '{}'".format(script_path_only)) + return strict_pipeline_key, script_path_only, script_path_with_flags - if not _os.path.isfile(abs_script_path_only.strip()): - _LOGGER.warn("Missing script command: '{}'". - format(abs_script_path_only)) - return strict_pipeline_key, abs_script_path_with_flags @copy From 79c6e39f64d54bdc9e0db8ce5638070b4667c0ea Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 17 Jun 2017 23:18:32 -0400 Subject: [PATCH 26/94] TODO items --- looper/looper.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 4f1c5d4a..5dc7568a 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -231,6 +231,8 @@ def run(prj, args, remaining_args): protocol = protocol.upper() _LOGGER.debug("Building pipeline(s) for protocol: '{}'". format(protocol)) + # TODO: this should be called just once per protocol, not + # TODO: for every Sample, as the call passes no Sample data. pipelines = prj.build_pipelines(protocol) if len(pipelines) == 0: skip_reasons.append( @@ -245,6 +247,9 @@ def run(prj, args, remaining_args): processed_samples.add(sample.sample_name) sample.to_yaml() + # TODO: determine whether it's before or after this point that the + # TODO: specific subtype should be created. + # Go through all pipelines to submit for this protocol. 
# Note: control flow doesn't reach this point if variable "pipelines" # cannot be assigned (library/protocol missing). @@ -256,8 +261,8 @@ def run(prj, args, remaining_args): # For each pipeline submission consideration, start fresh. skip_reasons = [] - _LOGGER.debug("Setting pipeline attributes for job '{}' (PL_ID: '{}')". - format(pipeline_job, pipeline_key)) + _LOGGER.debug("Setting pipeline attributes for job '{}' " + "(PL_ID: '{}')".format(pipeline_job, pipeline_key)) try: # Add pipeline-specific attributes. From 258ee99680ef4ac5e3f7e8ce99aaa710c8298ccd Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 18 Jun 2017 02:51:29 -0400 Subject: [PATCH 27/94] better control of when to build pipelines; iteration over protocols; finish of base Sample creation logic in the generator property; add in merge_table logic --- looper/looper.py | 18 ++-- looper/models.py | 218 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 158 insertions(+), 78 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 5dc7568a..156263e2 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -194,6 +194,9 @@ def run(prj, args, remaining_args): # Create a problem list so we can keep track and show them at the end. failures = [] + submission_bundle_by_protocol = \ + {p: prj.build_pipelines(p) for p in prj.protocols} + for sample in prj.samples: _LOGGER.debug(sample) _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) @@ -231,10 +234,9 @@ def run(prj, args, remaining_args): protocol = protocol.upper() _LOGGER.debug("Building pipeline(s) for protocol: '{}'". format(protocol)) - # TODO: this should be called just once per protocol, not - # TODO: for every Sample, as the call passes no Sample data. - pipelines = prj.build_pipelines(protocol) - if len(pipelines) == 0: + try: + pipelines = submission_bundle_by_protocol[protocol] + except KeyError: skip_reasons.append( "No pipeline found for protocol {}".format(protocol)) @@ -243,27 +245,25 @@ def run(prj, args, remaining_args): failures.append([skip_reasons, sample.sample_name]) continue + # TODO: determine what to do with subtype(s) here. # Processing preconditions have been met. processed_samples.add(sample.sample_name) sample.to_yaml() - # TODO: determine whether it's before or after this point that the - # TODO: specific subtype should be created. - # Go through all pipelines to submit for this protocol. # Note: control flow doesn't reach this point if variable "pipelines" # cannot be assigned (library/protocol missing). for pipeline_interface, sample_subtype, pipeline_key, pipeline_job \ in pipelines: + # pipeline_key (previously pl_id) is no longer necessarily + # script name, it's more flexible. - # pipeline_key (previously pl_id) is no longer necessarily script name, it's more flexible. # The current sample is active. # For each pipeline submission consideration, start fresh. skip_reasons = [] _LOGGER.debug("Setting pipeline attributes for job '{}' " "(PL_ID: '{}')".format(pipeline_job, pipeline_key)) - try: # Add pipeline-specific attributes. 
sample.set_pipeline_attributes( diff --git a/looper/models.py b/looper/models.py index fc9b7a73..e54a7617 100644 --- a/looper/models.py +++ b/looper/models.py @@ -329,12 +329,93 @@ def process_pipeline_interfaces(pipeline_interface_locations): -SampleSubmission = namedtuple( - "SampleSubmission", +SubmissionBundle = namedtuple( + "SubmissionBundle", field_names=["interface", "subtype", "pipeline", "command"]) +def merge_sample(sample, merge_table, derived_columns): + + if SAMPLE_NAME_COLNAME not in merge_table.columns: + raise KeyError( + "Merge table requires a column named '{}'.". + format(SAMPLE_NAME_COLNAME)) + + sample_indexer = merge_table[SAMPLE_NAME_COLNAME] == \ + getattr(sample, SAMPLE_NAME_COLNAME) + merge_rows = merge_table[sample_indexer] + + if len(merge_rows) > 0: + # For each row in the merge table of this sample: + # 1) populate any derived columns + # 2) derived columns --> space-delimited strings + # 3) update the sample values with the merge table + + # Keep track of merged cols, + # so we don't re-derive them later. + merged_cols = { + key: "" for key in merge_rows.columns} + for _, row in merge_rows.iterrows(): + row_dict = row.to_dict() + for col in merge_rows.columns: + if col == SAMPLE_NAME_COLNAME or \ + col not in derived_columns: + continue + # Initialize key in parent dict. + col_key = col + COL_KEY_SUFFIX + merged_cols[col_key] = "" + row_dict[col_key] = row_dict[col] + row_dict[col] = sample.locate_data_source( + col, row_dict[col], row_dict) # 1) + + # Also add in any derived cols present. + for col in derived_columns: + # Skip over attributes that the sample + # either lacks, and those covered by the + # data from the current (row's) data. + if not hasattr(sample, col) or \ + col in row_dict: + continue + # Map column name key to sample's value + # for the attribute given by column name. + col_key = col + COL_KEY_SUFFIX + row_dict[col_key] = getattr(sample, col) + # Map the column name itself to the + # populated data source template string. + row_dict[col] = sample.locate_data_source( + col, getattr(sample, col), row_dict) + _LOGGER.debug("PROBLEM adding derived column: " + "{}, {}, {}".format(col, row_dict[col], + getattr(sample, col))) + + # Since we are now jamming multiple (merged) + # entries into a single attribute, we have to + # join them into a space-delimited string + # and then set to sample attribute. + for key, val in row_dict.items(): + if key == SAMPLE_NAME_COLNAME or not val: + continue + _LOGGER.debug("merge: sample '%s'; %s=%s", + str(sample.name), str(key), str(val)) + if not key in merged_cols: + new_val = str(val).rstrip() + else: + new_val = "{} {}".format( + merged_cols[key], str(val)).strip() + merged_cols[key] = new_val # 2) + + # Don't update sample_name. + merged_cols.pop(SAMPLE_NAME_COLNAME, None) + + sample.update(merged_cols) # 3) + sample.merged = True # mark sample as merged + sample.merged_cols = merged_cols + + return sample + + + @copy class Project(AttributeDict): """ @@ -521,6 +602,22 @@ def project_folders(self): return ["results_subdir", "submission_subdir"] + @property + def protocols(self): + """ + Determine this Project's unique protocol names. 
+ + :return Set[str]: collection of this Project's unique protocol names + """ + protos = set() + for s in self.samples: + try: + protos.add(s.library) + except AttributeError: + _LOGGER.debug("Sample '%s' lacks protocol", s.sample_name) + return protos + + @property def required_metadata(self): """ @@ -544,39 +641,49 @@ def sample_names(self): @property def samples(self): + """ + Generic/base Sample instance for each of this Project's samples. + + :return generator[Sample]: Sample instance for each + of this Project's samples + """ # TODO: account for merge table; store or re-merge every time? # TODO: is it more likely to have a bunch of samples, or that # TODO: use of this and thus the need to re-merge is very frequent? - for _, row in self.sheet.df.iterrows(): - yield Sample(row.dropna()) - - - def samples_for(self, pipeline=None): - """ - Fetch each of this Project's Sample objects for given pipeline. - - If Sample objects have not yet been created for this Project, create - them as needed. What is needed and what is returned are determined by - the argument to the pipeline parameter. If no argument is provided as - the desired pipeline, it's assumed that the desire is for all possible - Sample objects to be created and return based on the samples that and - pipelines of which this Project is aware, and the relationships - between them. + if hasattr(self.metadata, "merge_table"): + if self.merge_table is None: + if _os.path.isfile(self.metadata.merge_table): + self.merge_table = _pd.read_table( + self.metadata.merge_table, + sep=None, engine="python") + else: + _LOGGER.debug("Alleged path to merge table data is not " + "a file: '%s'", self.metadata.merge_table) + else: + _LOGGER.debug("Already parsed merge table") + else: + _LOGGER.debug("No merge table") - :param str pipeline: name of pipeline for which to fetch samples - :return Iterable[Sample]: - """ - if not self._samples_by_pipeline: - # TODO: also ensure that case of unknown pipeline is covered here. - self.create_samples(pipeline) - if pipeline is None: - return list(itertools.chain(*self._samples_by_pipeline.values())) + if self.merge_table is None: + def merge(s): + return s else: + def merge(s): + return merge_sample(s, self.merge_table, self.derived_columns) + + for _, row in self.sheet.df.iterrows(): + sample = Sample(row.dropna()) + if hasattr(sample, "organism"): + sample.get_genome_transcriptome() + sample.set_file_paths() + # Hack for backwards-compatibility + # Pipelines should now use `data_source`) try: - return self._samples_by_pipeline[pipeline] - except KeyError: - _LOGGER.error("Unknown pipeline: '%s'", pipeline) - return [] + sample.data_path = sample.data_source + except AttributeError: + _LOGGER.debug("Sample '%s' lacks data source --> skipping " + "data path assignment", sample.sample_name) + yield merge(sample) @property @@ -640,8 +747,7 @@ def build_pipelines(self, protocol, priority=True): _LOGGER.warn("Unknown protocol: '{}'".format(protocol)) return [] - # Collect - jobs = [] + job_submission_bundles = [] pipeline_keys_used = set() _LOGGER.debug("Building pipelines for {} PIs...". format(len(protocol_interfaces))) @@ -651,8 +757,8 @@ def build_pipelines(self, protocol, priority=True): # pipeline(s) from a single location for each sample of the given # protocol, we can stop searching the pool of pipeline interface # information once we've found a match for the protocol. 
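A standalone sketch of that first-match-wins behavior over the SubmissionBundle tuples defined above; the field values here are placeholders rather than real interfaces or scripts:

    from collections import namedtuple
    from itertools import chain

    SubmissionBundle = namedtuple(
        "SubmissionBundle", ["interface", "subtype", "pipeline", "command"])

    # One inner list per pipeline interface location that maps this protocol.
    bundles_by_location = [
        [SubmissionBundle("iface_A", "Sample", "rrbs", "rrbs.py --opt")],
        [SubmissionBundle("iface_B", "Sample", "rrbs2", "rrbs2.py")],
    ]

    def collect(bundle_groups, priority=True):
        # With priority, only the highest-priority location's jobs are used.
        if priority and len(bundle_groups) > 1:
            return bundle_groups[0]
        return list(chain(*bundle_groups))

    print(collect(bundles_by_location))         # only iface_A's bundle
    print(collect(bundles_by_location, False))  # both bundles, flattened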
- if priority and len(jobs) > 0: - return jobs[0] + if priority and len(job_submission_bundles) > 0: + return job_submission_bundles[0] try: this_protocol_pipelines = \ @@ -706,50 +812,23 @@ def build_pipelines(self, protocol, priority=True): new_jobs = [] for pipeline_key in new_scripts: - strict_pipe_key, full_pipe_path, cmd = \ + strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ proto_iface.pipeline_key_to_path(pipeline_key) sample_subtype = Sample.select_sample_subtype( full_pipe_path, protocol) - submission = SampleSubmission( + submission_bundle = SubmissionBundle( proto_iface.interface, sample_subtype, - strict_pipe_key, cmd) - new_jobs.append(submission) + strict_pipe_key, full_pipe_path_with_flags) + new_jobs.append(submission_bundle) - jobs.append(new_jobs) + job_submission_bundles.append(new_jobs) # Repeat logic check of short-circuit conditional to account for # edge case in which it's satisfied during the final iteration. - return jobs[0] if priority and len(jobs) > 1 else \ - list(itertools.chain(*jobs)) - - - def create_samples(self, pipeline=None): - """ - Create Samples for this Project, for a particular pipeline if given. - - If Sample(s) already exist for the given pipeline, nothing is done. - This ensures creation work isn't duplicated but assumes that samples - are not added to a project after it's constructed, or at least not - for a type for which Sample objects have already been created. - - :param str pipeline: name of pipeline for which to create Samples - """ - if self.merge_table is None: - try: - if _os.path.isfile(self.metadata.merge_table): - self.merge_table = _pd.read_table( - self.metadata.merge_table, - sep=None, engine="python") - else: - _LOGGER.debug("Alleged merge_table file does not exist: " - "'%s'", self.metadata.merge_table) - except AttributeError: - _LOGGER.debug("No merge table") - if pipeline in self._samples_by_pipeline: - _LOGGER.debug("Sample(s) already exist for pipeline '%s'", - pipeline) - return - + if priority and len(job_submission_bundles) > 1: + return job_submission_bundles[0] + else: + return list(itertools.chain(*job_submission_bundles)) def finalize_pipelines_directory(self, pipe_path=""): @@ -1089,6 +1168,7 @@ def update_environment(self, env_settings_file): self.environment_file = env_settings_file + # TODO: remove once confident in replacement. def add_sample_sheet(self, csv=None, sample_builder=None): """ Build a `SampleSheet` object from a csv file and From 136faec4fbf2ab1abf3fd791940bc07bbaa70032 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 18 Jun 2017 02:57:14 -0400 Subject: [PATCH 28/94] slight cleanup and comments --- looper/models.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/looper/models.py b/looper/models.py index e54a7617..24358e46 100644 --- a/looper/models.py +++ b/looper/models.py @@ -329,13 +329,22 @@ def process_pipeline_interfaces(pipeline_interface_locations): +# Collect PipelineInterface, Sample type, pipeline path, and script with flags. SubmissionBundle = namedtuple( "SubmissionBundle", - field_names=["interface", "subtype", "pipeline", "command"]) + field_names=["interface", "subtype", "pipeline", "pipeline_with_flags"]) def merge_sample(sample, merge_table, derived_columns): + """ + Use merge table data to augment/modify Sample. 
+ + :param Sample sample: sample to modify via merge table data + :param merge_table: data with which to alter Sample + :param derived_columns: names of columns with data-derived value + :return Sample: updated input instance + """ if SAMPLE_NAME_COLNAME not in merge_table.columns: raise KeyError( @@ -1069,15 +1078,6 @@ def parse_config_file(self, subproject=None): path_config_file=self.config_file) - def pipelines_by_sample(self): - pass - - - def samples_by_pipeline(self): - pass - - - def set_compute(self, setting): """ Set the compute attributes according to the From eb3d39b84557f7e730c5a6b39695fd11a4c22907 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 18 Jun 2017 13:46:33 -0400 Subject: [PATCH 29/94] first pass at creating the Sample subtype in looper; getting YAML file for the base Sample and for each subtype; alphabetizing Sample class functions --- looper/looper.py | 11 +- looper/models.py | 476 +++++++++++++++++++++++++---------------------- 2 files changed, 258 insertions(+), 229 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 156263e2..149a55a6 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -235,7 +235,7 @@ def run(prj, args, remaining_args): _LOGGER.debug("Building pipeline(s) for protocol: '{}'". format(protocol)) try: - pipelines = submission_bundle_by_protocol[protocol] + submission_bundles = submission_bundle_by_protocol[protocol] except KeyError: skip_reasons.append( "No pipeline found for protocol {}".format(protocol)) @@ -249,15 +249,22 @@ def run(prj, args, remaining_args): # Processing preconditions have been met. processed_samples.add(sample.sample_name) sample.to_yaml() + sample_data = sample.as_series().to_dict() # Go through all pipelines to submit for this protocol. # Note: control flow doesn't reach this point if variable "pipelines" # cannot be assigned (library/protocol missing). for pipeline_interface, sample_subtype, pipeline_key, pipeline_job \ - in pipelines: + in submission_bundles: # pipeline_key (previously pl_id) is no longer necessarily # script name, it's more flexible. + _LOGGER.debug("Creating %s instance for sample '%s'", + sample_subtype.__name__, sample.sample_name) + sample = sample_subtype(sample_data) + pipeline_name, _ = os.path.splitext(pipeline_key) + sample.to_yaml(pipeline_name=pipeline_name) + # The current sample is active. # For each pipeline submission consideration, start fresh. skip_reasons = [] diff --git a/looper/models.py b/looper/models.py index 24358e46..c747a614 100644 --- a/looper/models.py +++ b/looper/models.py @@ -692,7 +692,9 @@ def merge(s): except AttributeError: _LOGGER.debug("Sample '%s' lacks data source --> skipping " "data path assignment", sample.sample_name) - yield merge(sample) + sample = merge(sample) + sample.prj = self + yield sample @property @@ -1615,6 +1617,8 @@ def __init__(self, series): if isinstance(series, _pd.Series): series = series.to_dict() + elif isinstance(series, Sample): + series = series.as_series().to_dict() # Set series attributes on self. for key, value in series.items(): @@ -1645,10 +1649,6 @@ def __init__(self, series): # call Sample.set_file_paths(). - def __repr__(self): - return "Sample '{}'".format(self.name) - - def __getitem__(self, item): """ Provides dict-style access to attributes @@ -1659,12 +1659,18 @@ def __getitem__(self, item): raise KeyError(item) - def update(self, newdata): + def __repr__(self): + return "Sample '{}'".format(self.name) + + + def as_series(self): """ - Update Sample object with attributes from a dict. 
+ Returns a `pandas.Series` object with all the sample's attributes. + + :return pandas.core.series.Series: pandas Series representation + of this Sample, with its attributes. """ - for key, value in newdata.items(): - setattr(self, key, value) + return _pd.Series(self.__dict__) def check_valid(self, required=None): @@ -1684,6 +1690,61 @@ def check_valid(self, required=None): return lacking + def confirm_required_inputs(self, permissive=False): + + # set_pipeline_attributes must be run first. + if not hasattr(self, "required_inputs"): + _LOGGER.warn("You must run set_pipeline_attributes " + "before confirm_required_inputs") + return True + + if not self.required_inputs: + _LOGGER.debug("No required inputs") + return True + + # First, attributes + for file_attribute in self.required_inputs_attr: + _LOGGER.debug("Checking '{}'".format(file_attribute)) + if not hasattr(self, file_attribute): + message = "Missing required input attribute '{}'".\ + format(file_attribute) + _LOGGER.warn(message) + if not permissive: + raise IOError(message) + else: + return False + if getattr(self, file_attribute) is "": + message = "Empty required input attribute '{}'".\ + format(file_attribute) + _LOGGER.warn(message) + if not permissive: + raise IOError(message) + else: + return False + + # Second, files + missing_files = [] + for paths in self.required_inputs: + # There can be multiple, space-separated values here. + for path in paths.split(" "): + _LOGGER.debug("Checking path: '{}'".format(path)) + if not _os.path.exists(path): + _LOGGER.warn("Missing required input file: '{}'".format(path)) + missing_files.append(path) + + if len(missing_files) > 0: + message = "Missing/unreadable file(s): {}".\ + format(", ".join(["'{}'".format(path) + for path in missing_files])) + if not permissive: + raise IOError(message) + else: + _LOGGER.error(message) + return False + + return True + + def generate_name(self): """ Generate name for the sample by joining some of its attribute strings. @@ -1691,82 +1752,123 @@ def generate_name(self): raise NotImplementedError("Not implemented in new code base.") - def as_series(self): + def get_attr_values(self, attrlist): """ - Returns a `pandas.Series` object with all the sample's attributes. + Get value corresponding to each given attribute. - :return pandas.core.series.Series: pandas Series representation - of this Sample, with its attributes. + :param str attrlist: name of an attribute storing a list of attr names + :return list: value (or empty string) corresponding to each named attr """ - return _pd.Series(self.__dict__) + if not hasattr(self, attrlist): + return None + + attribute_list = getattr(self, attrlist) + + # If attribute is None, then value is also None. + if not attribute_list: + return None + + if not isinstance(attribute_list, list): + attribute_list = [attribute_list] + + # Strings contained here are appended later so shouldn't be null. + return [getattr(self, attr) if hasattr(self, attr) else "" + for attr in attribute_list] - def to_yaml(self, path=None): + def get_genome_transcriptome(self): """ - Serializes itself in YAML format. + Get genome and transcriptome, based on project config file. + If not available (matching config), genome and transcriptome + will be set to sample.organism. 
+ """ + try: + self.genome = getattr(self.prj.genomes, self.organism) + except AttributeError: + _LOGGER.debug("Project config lacks genome mapping for " + "organism '%s'", str(self.organism)) + try: + self.transcriptome = getattr(self.prj.transcriptomes, self.organism) + except AttributeError: + _LOGGER.debug("Project config lacks transcriptome mapping for " + "organism '%s'", str(self.organism)) - :param str path: A file path to write yaml to. + + def get_sheet_dict(self): """ - def obj2dict(obj, to_skip=("samples", "sheet", "sheet_attributes")): - """ - Build representation of object as a dict, recursively - for all objects that might be attributes of self. + Create a K-V pairs for items originally passed in via the sample sheet. - :param object obj: what to serialize to write to YAML. - :param Iterable[str] to_skip: names of attributes to ignore. -\ """ - if isinstance(obj, list): - return [obj2dict(i) for i in obj] - if isinstance(obj, AttributeDict): - return {k: obj2dict(v) for k, v in obj.__dict__.items() - if k not in to_skip and - (k not in ATTRDICT_METADATA or - v != ATTRDICT_METADATA[k])} - elif isinstance(obj, Mapping): - return {k: obj2dict(v) - for k, v in obj.items() if k not in to_skip} - elif isinstance(obj, (Paths, Sample)): - return {k: obj2dict(v) - for k, v in obj.__dict__.items() if k not in to_skip} - elif hasattr(obj, 'dtype'): # numpy data types - # TODO: this fails with ValueError for multi-element array. - return obj.item() - elif _pd.isnull(obj): - # Missing values as evaluated by pd.isnull(). - # This gets correctly written into yaml. - return "NaN" - else: - return obj + This is useful for summarizing; it provides a representation of the + sample that excludes things like config files and derived entries. + + :return OrderedDict: mapping from name to value for data elements + originally provided via the sample sheet (i.e., the a map-like + representation of the instance, excluding derived items) + """ + return _OrderedDict([[k, getattr(self, k)] + for k in self.sheet_attributes]) - # If path is not specified, use default: - # prj.metadata.submission_dir + sample_name + yaml - self.yaml_file = path or \ - _os.path.join(self.prj.metadata.submission_subdir, - self.sample_name + ".yaml") - serial = obj2dict(self) - with open(self.yaml_file, 'w') as outfile: - outfile.write(yaml.safe_dump(serial, default_flow_style=False)) + + def infer_columns(self): + """ + Infer value for additional field(s) from other field(s). + + Add columns/fields to the sample based on values in those already-set + that the sample's project defines as indicative of implications for + additional data elements for the sample. 
+ + :return None: this function mutates state and is strictly for effect + """ + if not hasattr(self.prj, IMPLICATIONS_DECLARATION): + return + + impliers = self.prj[IMPLICATIONS_DECLARATION] + + _LOGGER.debug( + "Sample variable(s) that can imply others: %s", str(impliers)) + for implier_name, implied in impliers.items(): + _LOGGER.debug( + "Setting Sample variable(s) implied by '%s'", implier_name) + try: + implier_value = self[implier_name] + except KeyError: + _LOGGER.debug("No '%s' for this sample", implier_name) + continue + try: + implied_value_by_column = implied[implier_value] + _LOGGER.debug("Implications for '%s' = %s: %s", + implier_name, implier_value, + str(implied_value_by_column)) + for colname, implied_value in \ + implied_value_by_column.items(): + _LOGGER.log(5, "Setting '%s'=%s", + colname, implied_value) + setattr(self, colname, implied_value) + except KeyError: + _LOGGER.log( + 5, "Unknown implied value for implier '%s' = '%s'", + implier_name, implier_value) def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, source_key=None, extra_vars=None): """ - Uses the template path provided in the project config section - "data_sources" to piece together an actual path by substituting + Uses the template path provided in the project config section + "data_sources" to piece together an actual path by substituting variables (encoded by "{variable}"") with sample attributes. - :param str column_name: Name of sample attribute + :param str column_name: Name of sample attribute (equivalently, sample sheet column) specifying a derived column. - :param str source_key: The key of the data_source, - used to index into the project config data_sources section. - By default, the source key will be taken as the value of - the specified column (as a sample attribute). - For cases where the sample doesn't have this attribute yet + :param str source_key: The key of the data_source, + used to index into the project config data_sources section. + By default, the source key will be taken as the value of + the specified column (as a sample attribute). + For cases where the sample doesn't have this attribute yet (e.g. in a merge table), you must specify the source key. - :param dict extra_vars: By default, this will look to - populate the template location using attributes found in the - current sample; however, you may also provide a dict of extra - variables that can also be used for variable replacement. + :param dict extra_vars: By default, this will look to + populate the template location using attributes found in the + current sample; however, you may also provide a dict of extra + variables that can also be used for variable replacement. These extra variables are given a higher priority. :return str: regex expansion of data source specified in configuration, with variable substitutions made @@ -1818,21 +1920,13 @@ def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, return val - def get_genome_transcriptome(self): + def make_sample_dirs(self): """ - Get genome and transcriptome, based on project config file. - If not available (matching config), genome and transcriptome will be set to sample.organism. + Creates sample directory structure if it doesn't exist. 
""" - try: - self.genome = getattr(self.prj.genomes, self.organism) - except AttributeError: - _LOGGER.debug("Project config lacks genome mapping for " - "organism '%s'", str(self.organism)) - try: - self.transcriptome = getattr(self.prj.transcriptomes, self.organism) - except AttributeError: - _LOGGER.debug("Project config lacks transcriptome mapping for " - "organism '%s'", str(self.organism)) + for path in self.paths: + if not _os.path.exists(path): + _os.makedirs(path) def set_file_paths(self): @@ -1873,71 +1967,6 @@ def set_file_paths(self): pass - def infer_columns(self): - """ - Infer value for additional field(s) from other field(s). - - Add columns/fields to the sample based on values in those already-set - that the sample's project defines as indicative of implications for - additional data elements for the sample. - - :return None: this function mutates state and is strictly for effect - """ - if not hasattr(self.prj, IMPLICATIONS_DECLARATION): - return - - impliers = self.prj[IMPLICATIONS_DECLARATION] - - _LOGGER.debug( - "Sample variable(s) that can imply others: %s", str(impliers)) - for implier_name, implied in impliers.items(): - _LOGGER.debug( - "Setting Sample variable(s) implied by '%s'", implier_name) - try: - implier_value = self[implier_name] - except KeyError: - _LOGGER.debug("No '%s' for this sample", implier_name) - continue - try: - implied_value_by_column = implied[implier_value] - _LOGGER.debug("Implications for '%s' = %s: %s", - implier_name, implier_value, - str(implied_value_by_column)) - for colname, implied_value in \ - implied_value_by_column.items(): - _LOGGER.log(5, "Setting '%s'=%s", - colname, implied_value) - setattr(self, colname, implied_value) - except KeyError: - _LOGGER.log( - 5, "Unknown implied value for implier '%s' = '%s'", - implier_name, implier_value) - - - def make_sample_dirs(self): - """ - Creates sample directory structure if it doesn't exist. - """ - for path in self.paths: - if not _os.path.exists(path): - _os.makedirs(path) - - - def get_sheet_dict(self): - """ - Create a K-V pairs for items originally passed in via the sample sheet. - - This is useful for summarizing; it provides a representation of the - sample that excludes things like config files and derived entries. - - :return OrderedDict: mapping from name to value for data elements - originally provided via the sample sheet (i.e., the a map-like - representation of the instance, excluding derived items) - """ - return _OrderedDict([[k, getattr(self, k)] - for k in self.sheet_attributes]) - - def set_pipeline_attributes( self, pipeline_interface, pipeline_name, permissive=True): """ @@ -1983,85 +2012,6 @@ def set_pipeline_attributes( self.input_file_size = get_file_size(self.all_inputs) - def confirm_required_inputs(self, permissive=False): - - # set_pipeline_attributes must be run first. 
- if not hasattr(self, "required_inputs"): - _LOGGER.warn("You must run set_pipeline_attributes " - "before confirm_required_inputs") - return True - - if not self.required_inputs: - _LOGGER.debug("No required inputs") - return True - - # First, attributes - for file_attribute in self.required_inputs_attr: - _LOGGER.debug("Checking '{}'".format(file_attribute)) - if not hasattr(self, file_attribute): - message = "Missing required input attribute '{}'".\ - format(file_attribute) - _LOGGER.warn(message) - if not permissive: - raise IOError(message) - else: - return False - if getattr(self, file_attribute) is "": - message = "Empty required input attribute '{}'".\ - format(file_attribute) - _LOGGER.warn(message) - if not permissive: - raise IOError(message) - else: - return False - - # Second, files - missing_files = [] - for paths in self.required_inputs: - # There can be multiple, space-separated values here. - for path in paths.split(" "): - _LOGGER.debug("Checking path: '{}'".format(path)) - if not _os.path.exists(path): - _LOGGER.warn("Missing required input file: '{}'".format(path)) - missing_files.append(path) - - if len(missing_files) > 0: - message = "Missing/unreadable file(s): {}".\ - format(", ".join(["'{}'".format(path) - for path in missing_files])) - if not permissive: - raise IOError(message) - else: - _LOGGER.error(message) - return False - - return True - - - def get_attr_values(self, attrlist): - """ - Get value corresponding to each given attribute. - - :param str attrlist: name of an attribute storing a list of attr names - :return list: value (or empty string) corresponding to each named attr - """ - if not hasattr(self, attrlist): - return None - - attribute_list = getattr(self, attrlist) - - # If attribute is None, then value is also None. - if not attribute_list: - return None - - if not isinstance(attribute_list, list): - attribute_list = [attribute_list] - - # Strings contained here are appended later so shouldn't be null. - return [getattr(self, attr) if hasattr(self, attr) else "" - for attr in attribute_list] - - def set_read_type(self, n=10, permissive=True): """ For a sample with attr `ngs_inputs` set, this sets the @@ -2158,6 +2108,78 @@ def set_read_type(self, n=10, permissive=True): feature, self.name) + def to_yaml(self, path=None, pipeline_name=None): + """ + Serializes itself in YAML format. + + :param str path: A file path to write yaml to. + :param str pipeline_name: name of a pipeline to which this particular + Sample instance pertains (i.e., perhaps the name of a module + that defined a Sample subclass of which this is an instance) + :return str: filepath used (same as input if given, otherwise the + path value that was inferred) + """ + + + def obj2dict(obj, + to_skip=("samples", "sheet", "sheet_attributes")): + """ + Build representation of object as a dict, recursively + for all objects that might be attributes of self. + + :param object obj: what to serialize to write to YAML. + :param Iterable[str] to_skip: names of attributes to ignore. 
+\ """ + if isinstance(obj, list): + return [obj2dict(i) for i in obj] + if isinstance(obj, AttributeDict): + return {k: obj2dict(v) for k, v in obj.__dict__.items() + if k not in to_skip and + (k not in ATTRDICT_METADATA or + v != ATTRDICT_METADATA[k])} + elif isinstance(obj, Mapping): + return {k: obj2dict(v) + for k, v in obj.items() if k not in to_skip} + elif isinstance(obj, (Paths, Sample)): + return {k: obj2dict(v) + for k, v in obj.__dict__.items() if + k not in to_skip} + elif hasattr(obj, 'dtype'): # numpy data types + # TODO: this fails with ValueError for multi-element array. + return obj.item() + elif _pd.isnull(obj): + # Missing values as evaluated by pd.isnull(). + # This gets correctly written into yaml. + return "NaN" + else: + return obj + + + # Determine filepath, prioritizing anything given, then falling + # back to a default using this Sample's Project's submission_subdir. + # Use the sample name and YAML extension as the file name, + # interjecting a pipeline name as a subfolder within the Project's + # submission_subdir if such a pipeline name is provided. + if not path: + submission_dirpath = self.prj.metadata.submission_subdir + filename = "{}.{}.yaml".format(self.sample_name, pipeline_name) \ + if pipeline_name else "{}.yaml".format(self.sample_name) + path = _os.path.join(submission_dirpath, filename) + self.yaml_file = path + + serial = obj2dict(self) + with open(self.yaml_file, 'w') as outfile: + outfile.write(yaml.safe_dump(serial, default_flow_style=False)) + + + def update(self, newdata): + """ + Update Sample object with attributes from a dict. + """ + for key, value in newdata.items(): + setattr(self, key, value) + + @classmethod def select_sample_subtype(cls, pipeline_filepath, protocol=None): """ @@ -2171,10 +2193,10 @@ def select_sample_subtype(cls, pipeline_filepath, protocol=None): :param str pipeline_filepath: path to file defining a pipeline :param str protocol: name of protocol for which to select Sample subtype - :return type: Sample type most tailored to indicated protocol and defined - within the module indicated by the given filepath, optional; if - unspecified, or if the indicated file cannot be imported, then the - base Sample type is returned. Critically, the indicated + :return type: Sample type most tailored to indicated protocol and + defined within the module indicated by the given filepath, + optional; if unspecified, or if the indicated file cannot be + imported, then the base Sample type is returned. """ if not _os.path.isfile(pipeline_filepath): From f1e947f3dab7fa826fced78d5414e2c73cd85b6f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 18 Jun 2017 14:28:55 -0400 Subject: [PATCH 30/94] need to finalize pipelines directory before processing locations; better messaging --- looper/models.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/looper/models.py b/looper/models.py index c747a614..f9a6e6da 100644 --- a/looper/models.py +++ b/looper/models.py @@ -311,9 +311,15 @@ def __repr__(self): def process_pipeline_interfaces(pipeline_interface_locations): """ + Create a ProtocolInteraface for each pipeline location given. - :param pipeline_interface_locations: - :return: + :param Iterable[str] pipeline_interface_locations: locations, each of + which should be either a directory path or a filepath, that specifies + pipeline interace and protocol mappings information. 
Each such file + should be have a pipelines section and a protocol mappings section + whereas each folder should have a file for each of those sections. + :return Mapping[str, ProtocolInterace]: mapping from protocol name to + interface(s) for which that protocol is mapped """ ifproto_by_proto_name = defaultdict(list) for pipe_iface_location in pipeline_interface_locations: @@ -553,10 +559,13 @@ def __init__(self, config_file, subproject=None, # SampleSheet creation populates project's samples, adds the # sheet itself, and adds any derived columns. + _LOGGER.debug("Processing {} pipeline location(s): {}". + format(len(self.metadata.pipelines_dir), + self.metadata.pipelines_dir)) + self.finalize_pipelines_directory() self.interfaces_by_protocol = \ process_pipeline_interfaces(self.metadata.pipelines_dir) self.sheet = check_sheet(self.metadata.sample_annotation) - self.finalize_pipelines_directory() # Defer Sample creation until needed. self._samples_by_pipeline = {} @@ -989,7 +998,8 @@ def parse_config_file(self, subproject=None): if "pipelines_dir" in self.metadata: _LOGGER.warning("Looper v0.6 suggests " "switching from pipelines_dir to " - " pipeline_interfaces. See docs for details.") + "pipeline_interfaces. See docs for details: " + "http://looper.readthedocs.io/en/latest/") if "pipeline_interfaces" in self.metadata: if "pipelines_dir" in self.metadata: raise AttributeError( @@ -997,8 +1007,8 @@ def parse_config_file(self, subproject=None): "'pipelines_dir'. Please remove your " "'pipelines_dir' definition.") else: - self.metadata.pipelines_dir = self.metadata.pipeline_interfaces - + self.metadata.pipelines_dir = \ + self.metadata.pipeline_interfaces _LOGGER.debug("Adding pipeline_interfaces to " "pipelines_dir. New value: {}". format(self.metadata.pipelines_dir)) @@ -2300,8 +2310,8 @@ def __init__(self, config): self.pipe_iface_config = config else: - _LOGGER.debug("Parsing '%s' for %s config data", - config, self.__class__.__name__) + _LOGGER.debug("Parsing '%s' for PipelineInterface config data", + config) self.pipe_iface_file = config with open(config, 'r') as f: self.pipe_iface_config = yaml.load(f) From eaccdffc9c7bc5d9dc88d94c1496f1cb07cf6ce8 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 02:38:21 -0400 Subject: [PATCH 31/94] Project representation restrictions; num_samples not working --- looper/looper.py | 12 ++++++------ looper/models.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 149a55a6..953cf14c 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -183,7 +183,7 @@ def run(prj, args, remaining_args): to be passed on to parser(s) elsewhere """ - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) valid_read_types = ["single", "paired"] # Keep track of how many jobs have been submitted. 
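The reason for swapping out len(prj.samples) here and in the hunks below: samples became a generator property a few patches back, and generators have no len(); a count has to consume an iterator, which is what the new num_samples property does. A minimal, self-contained illustration:

    def samples():
        # Stand-in for the generator property: yields one record per sheet row.
        for name in ("frog_1", "frog_2", "frog_3"):
            yield {"sample_name": name}

    try:
        len(samples())                 # generators do not support len()
    except TypeError as err:
        print("len() fails: {}".format(err))

    print(sum(1 for _ in samples()))   # 3 -- the counting approach num_samples uses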
@@ -429,7 +429,7 @@ def summarize(prj): columns = [] stats = [] - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) @@ -491,7 +491,7 @@ def destroy(prj, args, preview_flag=True): _LOGGER.info("Results to destroy:") - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) @@ -528,7 +528,7 @@ def clean(prj, args, preview_flag=True): _LOGGER.info("Files to clean:") - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) @@ -813,9 +813,9 @@ def main(): raise AttributeError( "Looper requires at least one pipeline(s) location.") - if not prj.interface_manager.ifproto_by_proto_name: + if not prj.interfaces_by_protocol: _LOGGER.error( - "The interface manager is empty. Does your project point " + "The Project knows no protocols. Does it point " "to at least one pipelines location that exists?") return try: diff --git a/looper/models.py b/looper/models.py index f9a6e6da..9cf4dedb 100644 --- a/looper/models.py +++ b/looper/models.py @@ -104,6 +104,12 @@ def is_url(maybe_url): +def include_in_repr(attr, klazz): + return attr not in \ + {"Project": ["sheet", "interfaces_by_protocol"]}[klazz.__name__] + + + @copy class Paths(object): """ A class to hold paths as attributes. """ @@ -187,8 +193,7 @@ def __setattr__(self, key, value): def __getattr__(self, item, default=None): """ - Fetch the value associated with the provided identifier. Unlike an - ordinary object, `AttributeDict` supports fetching + Fetch the value associated with the provided identifier. :param int | str item: identifier for value to fetch :return object: whatever value corresponds to the requested key/item @@ -200,6 +205,10 @@ def __getattr__(self, item, default=None): anyway. More specifically, respect attribute naming that appears to be indicative of the intent of protection. """ + try: + return super(AttributeDict, self).__getattribute__(item) + except AttributeError: + pass try: # Fundamentally, this is still a mapping; # route object notation access pattern accordingly. @@ -311,7 +320,7 @@ def __repr__(self): def process_pipeline_interfaces(pipeline_interface_locations): """ - Create a ProtocolInteraface for each pipeline location given. + Create a ProtocolInterface for each pipeline location given. :param Iterable[str] pipeline_interface_locations: locations, each of which should be either a directory path or a filepath, that specifies @@ -566,9 +575,12 @@ def __init__(self, config_file, subproject=None, self.interfaces_by_protocol = \ process_pipeline_interfaces(self.metadata.pipelines_dir) self.sheet = check_sheet(self.metadata.sample_annotation) + self.merge_table = None - # Defer Sample creation until needed. - self._samples_by_pipeline = {} + + def __repr__(self): + include = partial(include_in_repr, klazz=self.__class__) + return repr({k: v for k, v in self.__dict__.items() if include(k)}) @property @@ -589,6 +601,12 @@ def default_compute_envfile(self): self.templates_folder, "default_compute_settings.yaml") + @property + def num_samples(self): + """ Number of samples available in this Project. 
""" + return sum(1 for _ in self.samples) + + @property def output_dir(self): """ @@ -2597,6 +2615,10 @@ def __init__(self, pipedir): "as a file nor as a folder.".format(pipedir)) + def __repr__(self): + return repr(self.__dict__) + + def pipeline_key_to_path(self, pipeline_key): """ Given a pipeline_key, return the path to the script for that pipeline From f96a7e2c5f4162e4fb0ae41d496966843c68ce68 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 02:41:54 -0400 Subject: [PATCH 32/94] num_samples working; it's samples itself that's problematic, via df attr --- looper/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/looper/models.py b/looper/models.py index 9cf4dedb..8d24e0f0 100644 --- a/looper/models.py +++ b/looper/models.py @@ -604,7 +604,7 @@ def default_compute_envfile(self): @property def num_samples(self): """ Number of samples available in this Project. """ - return sum(1 for _ in self.samples) + return sum(1 for _ in self.sample_names) @property From 5c382e290a5dfd8a958ad45c666dff8005682565 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 02:45:32 -0400 Subject: [PATCH 33/94] bump up binding of Project to Sample --- looper/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/looper/models.py b/looper/models.py index 8d24e0f0..f2ddc6ff 100644 --- a/looper/models.py +++ b/looper/models.py @@ -707,8 +707,9 @@ def merge(s): def merge(s): return merge_sample(s, self.merge_table, self.derived_columns) - for _, row in self.sheet.df.iterrows(): + for _, row in self.sheet.iterrows(): sample = Sample(row.dropna()) + sample.prj = self if hasattr(sample, "organism"): sample.get_genome_transcriptome() sample.set_file_paths() @@ -720,7 +721,6 @@ def merge(s): _LOGGER.debug("Sample '%s' lacks data source --> skipping " "data path assignment", sample.sample_name) sample = merge(sample) - sample.prj = self yield sample From fa7989e0456cad1934fa183c6a9a4ea615f5dea3 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 16:37:50 -0400 Subject: [PATCH 34/94] cutting the Sample-to-Project cord; removing functions that have been replaced; better repr functions; genome/transcriptome setters --- looper/looper.py | 19 +- looper/models.py | 616 +++++++++++++---------------------------------- looper/utils.py | 11 +- 3 files changed, 190 insertions(+), 456 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 953cf14c..83e8a367 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -14,7 +14,7 @@ import pandas as _pd from . import setup_looper_logger, LOGGING_LEVEL, __version__ from .loodels import Project -from .models import COMPUTE_SETTINGS_VARNAME +from .models import Sample, COMPUTE_SETTINGS_VARNAME from .utils import VersionInHelpParser try: @@ -248,7 +248,9 @@ def run(prj, args, remaining_args): # TODO: determine what to do with subtype(s) here. # Processing preconditions have been met. processed_samples.add(sample.sample_name) - sample.to_yaml() + _LOGGER.debug("Writing base Sample representation to disk: '%s'", + sample.sample_name) + sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir) sample_data = sample.as_series().to_dict() # Go through all pipelines to submit for this protocol. 
@@ -262,8 +264,13 @@ def run(prj, args, remaining_args): _LOGGER.debug("Creating %s instance for sample '%s'", sample_subtype.__name__, sample.sample_name) sample = sample_subtype(sample_data) - pipeline_name, _ = os.path.splitext(pipeline_key) - sample.to_yaml(pipeline_name=pipeline_name) + if sample_subtype != Sample: + # Only rewrite the file if we have a proper subtype. + pipeline_name, _ = os.path.splitext(pipeline_key) + _LOGGER.debug("Representing sample '%s' on disk as %s", + sample.sample_name, sample_subtype.__name__) + sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir, + pipeline_name=pipeline_name) # The current sample is active. # For each pipeline submission consideration, start fresh. @@ -662,7 +669,9 @@ def cluster_submit( handle.write(filedata) # Prepare and write sample yaml object - sample.to_yaml() + _LOGGER.debug("Writing sample '%s' representation to disk", + sample.sample_name) + sample.to_yaml(subs_folder_path=submission_folder) # Check if job is already submitted (unless ignore_flags is set to True) if not ignore_flags: diff --git a/looper/models.py b/looper/models.py index f2ddc6ff..2538db99 100644 --- a/looper/models.py +++ b/looper/models.py @@ -76,6 +76,7 @@ SAMPLE_NAME_COLNAME = "sample_name" SAMPLE_ANNOTATIONS_KEY = "sample_annotation" IMPLICATIONS_DECLARATION = "implied_columns" +DATA_SOURCES_SECTION = "data_sources" COL_KEY_SUFFIX = "_key" ATTRDICT_METADATA = {"_force_nulls": False, "_attribute_identity": False} @@ -110,6 +111,32 @@ def include_in_repr(attr, klazz): +class PepYamlRepresenter(yaml.representer.Representer): + """ Should object's YAML representation fail, get additional info. """ + + def represent_data(self, data): + """ + Supplement PyYAML's context info in case of representation failure. + + :param object data: same as superclass + :return object: same as superclass + """ + try: + return super(PepYamlRepresenter, self).represent_data(data) + except yaml.representer.RepresenterError: + _LOGGER.error("YAML representation error: {} ({})". + format(data, type(data))) + raise + + +# Bespoke YAML dumper, using the custom data/object Representer. +PepYamlDumper = type("PepYamlDumper", + (yaml.emitter.Emitter, yaml.serializer.Serializer, + PepYamlRepresenter, yaml.resolver.Resolver), + dict(yaml.dumper.Dumper.__dict__)) + + + @copy class Paths(object): """ A class to hold paths as attributes. """ @@ -131,7 +158,7 @@ def __iter__(self): """ return iter(self.__dict__.values()) - def __str__(self): + def __repr__(self): return "Paths object." @@ -351,10 +378,11 @@ def process_pipeline_interfaces(pipeline_interface_locations): -def merge_sample(sample, merge_table, derived_columns): +def merge_sample(data_sources, sample, merge_table, derived_columns): """ Use merge table data to augment/modify Sample. + :param Mapping data_sources: collection of named paths to data locations :param Sample sample: sample to modify via merge table data :param merge_table: data with which to alter Sample :param derived_columns: names of columns with data-derived value @@ -391,7 +419,7 @@ def merge_sample(sample, merge_table, derived_columns): merged_cols[col_key] = "" row_dict[col_key] = row_dict[col] row_dict[col] = sample.locate_data_source( - col, row_dict[col], row_dict) # 1) + data_sources, col, row_dict[col], row_dict) # 1) # Also add in any derived cols present. for col in derived_columns: @@ -408,7 +436,7 @@ def merge_sample(sample, merge_table, derived_columns): # Map the column name itself to the # populated data source template string. 
row_dict[col] = sample.locate_data_source( - col, getattr(sample, col), row_dict) + data_sources, col, getattr(sample, col), row_dict) _LOGGER.debug("PROBLEM adding derived column: " "{}, {}, {}".format(col, row_dict[col], getattr(sample, col))) @@ -709,10 +737,10 @@ def merge(s): for _, row in self.sheet.iterrows(): sample = Sample(row.dropna()) - sample.prj = self - if hasattr(sample, "organism"): - sample.get_genome_transcriptome() - sample.set_file_paths() + sample.set_genome(self.genomes) + sample.set_transcriptome(self.transcriptomes) + + sample.set_file_paths(self) # Hack for backwards-compatibility # Pipelines should now use `data_source`) try: @@ -846,7 +874,7 @@ def build_pipelines(self, protocol, priority=True): _LOGGER.debug("{} new scripts for protocol {} from " "pipelines warehouse '{}': {}". format(len(new_scripts), protocol, - proto_iface.pipedir, ", ".join(new_scripts))) + proto_iface.location, ", ".join(new_scripts))) new_jobs = [] for pipeline_key in new_scripts: @@ -1198,143 +1226,6 @@ def update_environment(self, env_settings_file): self.environment_file = env_settings_file - # TODO: remove once confident in replacement. - def add_sample_sheet(self, csv=None, sample_builder=None): - """ - Build a `SampleSheet` object from a csv file and - add it and its samples to the project. - - :param csv: Path to csv file. - :type csv: str - :param sample_builder: how to create single Sample from raw input data. - :type sample_builder: function(pandas.Series | dict) -> Sample - """ - - _LOGGER.debug("Adding sample sheet") - - # Make SampleSheet object - # By default read sample_annotation, but allow explict CSV arg. - self.sheet = SampleSheet(csv or self.metadata.sample_annotation) - - # Pair project and sheet. - self.sheet.prj = self - - # Generate sample objects from annotation sheet. - _LOGGER.debug("Creating samples from annotation sheet") - self.sheet.make_samples(sample_builder) - - # Add samples to Project. - for sample in self.sheet.samples: - # Overwritten later if merged - sample.merged = False - # Tie sample and project bilaterally - sample.prj = self - self.samples.append(sample) - - # Merge sample files (!) using merge table if provided: - if hasattr(self.metadata, "merge_table"): - if self.metadata.merge_table is not None: - if _os.path.isfile(self.metadata.merge_table): - # read in merge table - - merge_table = _pd.read_table( - self.metadata.merge_table, - sep=None, index_col=False, engine="python") - - if SAMPLE_NAME_COLNAME not in merge_table.columns: - raise KeyError( - "Merge table requires a column named '{}'.". - format(SAMPLE_NAME_COLNAME)) - - for sample in self.sheet.samples: - sample_indexer = \ - merge_table[SAMPLE_NAME_COLNAME] == sample.name - merge_rows = merge_table[sample_indexer] - - # Check if there are rows in the - # merge table for this sample: - if len(merge_rows) > 0: - # For each row in the merge table of this sample: - # 1) populate any derived columns - # 2) derived columns --> space-delimited strings - # 3) update the sample values with the merge table - - # Keep track of merged cols, - # so we don't re-derive them later. - merged_cols = { - key: "" for key in merge_rows.columns} - for _, row in merge_rows.iterrows(): - row_dict = row.to_dict() - for col in merge_rows.columns: - if col == SAMPLE_NAME_COLNAME or \ - col not in self.derived_columns: - continue - # Initialize key in parent dict. 
- col_key = col + COL_KEY_SUFFIX - merged_cols[col_key] = "" - row_dict[col_key] = row_dict[col] - row_dict[col] = sample.locate_data_source( - col, row_dict[col], row_dict) # 1) - - # Also add in any derived cols present. - for col in self.derived_columns: - # Skip over attributes that the sample - # either lacks, and those covered by the - # data from the current (row's) data. - if not hasattr(sample, col) or \ - col in row_dict: - continue - # Map column name key to sample's value - # for the attribute given by column name. - col_key = col + COL_KEY_SUFFIX - row_dict[col_key] = getattr(sample, col) - # Map the column name itself to the - # populated data source template string. - row_dict[col] = sample.locate_data_source( - col, getattr(sample, col), row_dict) - _LOGGER.debug( - "PROBLEM adding derived column: " - "{}, {}, {}".format(col, - row_dict[col], getattr(sample, col))) - - # Since we are now jamming multiple (merged) - # entries into a single attribute, we have to - # join them into a space-delimited string - # and then set to sample attribute. - for key, val in row_dict.items(): - if key == SAMPLE_NAME_COLNAME or not val: - continue - _LOGGER.debug("merge: sample '%s'; %s=%s", - str(sample.name), - str(key), str(val)) - if not key in merged_cols: - new_val = str(val).rstrip() - else: - new_val = "{} {}".format( - merged_cols[key], str(val)).strip() - merged_cols[key] = new_val # 2) - - # Don't update sample_name. - merged_cols.pop(SAMPLE_NAME_COLNAME, None) - - sample.update(merged_cols) # 3) - sample.merged = True # mark sample as merged - sample.merged_cols = merged_cols - - # With all samples, prepare file paths. - for sample in self.sheet.samples: - if hasattr(sample, "organism"): - sample.get_genome_transcriptome() - sample.set_file_paths() - # Hack for backwards-compatibility - # Pipelines should now use `data_source`) - try: - sample.data_path = sample.data_source - except AttributeError: - _LOGGER.debug("Sample '%s' lacks data source --> skipping " - "data path assignment", sample.sample_name) - - def _ensure_absolute(self, maybe_relpath): """ Ensure that a possibly relative path is absolute. """ _LOGGER.debug("Ensuring absolute: '%s'", maybe_relpath) @@ -1393,221 +1284,6 @@ def check_sheet(sample_file, dtype=str): return df -@copy -class SampleSheet(object): - """ - Class to model a sample annotation sheet. - - :param path: Path to sample file. - :type path: str - :param dtype: Data type to read sample file as. Default is str. - :type dtype: type - - :Example: - - .. code-block:: python - - from models import Project, SampleSheet - prj = Project("config.yaml") - sheet = SampleSheet("sheet.csv") - """ - - def __init__(self, path, dtype=str): - super(SampleSheet, self).__init__() - self.df = self.check_sheet(path, dtype) - self.path = path - self.samples = list() - - def __repr__(self): - if hasattr(self, "prj"): - return "SampleSheet for project '%s' with %i samples." % \ - (self.prj, len(self.df)) - else: - return "SampleSheet with %i samples." % len(self.df) - - - @staticmethod - def alpha_cased(text, lower=False): - """ - Filter text to just letters and homogenize case. - - :param str text: what to filter and homogenize. - :param bool lower: whether to convert to lowercase; default uppercase. - :return str: input filtered to just letters, with homogenized case. 
- """ - text = "".join(filter(lambda c: c.isalpha(), text)) - return text.lower() if lower else text.upper() - - - def make_samples(self, sample_builder=None): - """ - Create samples (considering library) from annotation sheet, - and add them to the project. - """ - create_sample = sample_builder or self._find_sample_subtypes() - for _, row in self.df.iterrows(): - self.samples.append(create_sample(row.dropna())) - - - def _find_sample_subtypes(self): - """ - Determine how to create Sample instances. - - Search modules for classes that extend Sample in order to find - those that are more specifically tailored to a particular - data or experiment type. - - :return function(Mapping | pd.core.series.Series) -> Sample: function - that takes input data and creates a Sample (or perhaps a subclass). - """ - try: - import pipelines # Use a pipelines package if installed. - except ImportError: - # pipelines_dir is optional. - pipeline_dirpaths = getattr( - self.prj.metadata, "pipelines_dir", None) - if pipeline_dirpaths: - if isinstance(pipeline_dirpaths, str): - pipeline_dirpaths = [pipeline_dirpaths] - sys.path.extend(pipeline_dirpaths) - _LOGGER.debug( - "Added {} pipelines path(s) to sys.path: {}". - format(len(pipeline_dirpaths), pipeline_dirpaths)) - else: - _LOGGER.debug("No pipelines directories to add to import path") - try: - import pipelines - except ImportError: - found_pipelines = False - else: - found_pipelines = True - else: - found_pipelines = True - - if not found_pipelines: - _LOGGER.debug("Could not import pipelines") - # Just return a basic Sample for each of the sheet's rows. - def make_sample(data): - return Sample(data) - else: - _LOGGER.debug("Successfully imported pipelines") - # Attempt creation of Sample subtype specific to protocol. - - # Get all pipelines package Sample subclasses. - import inspect - from utils import fetch_package_classes - sample_types = fetch_package_classes(pipelines, - lambda maybe_class: inspect.isclass(maybe_class) - and issubclass(maybe_class, Sample)) - - _LOGGER.debug("Sample subtypes: %s", - ", ".join([subtype.__name__ - for subtype in sample_types])) - - # TODO: perhaps modify or alter handling of need for __library__. - pairing = {self.alpha_cased(sample_class.__library__): sample_class - for sample_type, sample_class in sample_types} - - def make_sample(data): - # Create the most specific Sample type possible. - try: - protocol = data.library - except AttributeError: - _LOGGER.debug("Sample data lacks 'library' attribute") - return Sample(data) - try: - return pairing[self.alpha_cased(protocol)](data) - except KeyError: - _LOGGER.debug("Unknown protocol: '{}'; known: {}". - format(protocol, pairing.keys())) - return Sample(data) - - return make_sample - - - - def protocol_to_subclass(self): - try: - import pipelines # Use a pipelines package if installed. - except ImportError: - # pipelines_dir is optional. - pipeline_dirpaths = getattr( - self.prj.metadata, "pipelines_dir", None) - - if not pipeline_dirpaths: - _LOGGER.debug("No pipelines directories to add to import path") - return None - - if isinstance(pipeline_dirpaths, str): - pipeline_dirpaths = [pipeline_dirpaths] - sys.path.extend(pipeline_dirpaths) - _LOGGER.debug( - "Added {} pipelines path(s) to sys.path: {}". - format(len(pipeline_dirpaths), pipeline_dirpaths)) - - try: - import pipelines - except ImportError: - _LOGGER.debug("Could not import pipelines") - return None - - _LOGGER.debug("Successfully imported pipelines") - - # Get all pipelines package Sample subclasses. 
- import inspect - from utils import fetch_package_classes - sample_types = fetch_package_classes(pipelines, - lambda maybe_class: inspect.isclass(maybe_class) - and issubclass(maybe_class, Sample)) - - # TODO: perhaps modify or alter handling of need for __library__. - return {self.alpha_cased(sample_class.__library__): sample_class - for sample_type, sample_class in sample_types} - - - def as_data_frame(self): - """ - Returns a `pandas.DataFrame` representation of self. - """ - return _pd.DataFrame([s.as_series() for s in self.samples]) - - - def write(self, path, sep=None): - """ - Saves an annotation sheet from the samples. - - :param path: Path to file to be written. - :type path: str - :param sep: Delimiter to use in the file written. - :type sep: str - - :Example: - - .. code-block:: python - - from models import SampleSheet - sheet = SampleSheet("/projects/example/sheet.csv") - sheet.write("~/projects/example/sheet2.csv") - """ - - valid_types = [".txt", ".tsv", ".csv"] - - # Infer delimiter if needed. - if sep is None: - file_type = _os.path.splitext(path)[1].lower() - if file_type not in valid_types: - help_msg = "Provide an argument for parameter 'sep' or pass a " \ - "filepath with an extension in: {}".\ - format(valid_types) - raise ValueError(help_msg) - sep = "," if file_type == ".csv" else "\t" - - # Convert to frame and write to disk. - with open(path, 'w') as sheetfile: - # TODO: decide which--if any--attributes to drop here. - self.as_data_frame().to_csv(sheetfile, sep=sep, index=False) - - @copy class Sample(object): @@ -1804,24 +1480,6 @@ def get_attr_values(self, attrlist): for attr in attribute_list] - def get_genome_transcriptome(self): - """ - Get genome and transcriptome, based on project config file. - If not available (matching config), genome and transcriptome - will be set to sample.organism. - """ - try: - self.genome = getattr(self.prj.genomes, self.organism) - except AttributeError: - _LOGGER.debug("Project config lacks genome mapping for " - "organism '%s'", str(self.organism)) - try: - self.transcriptome = getattr(self.prj.transcriptomes, self.organism) - except AttributeError: - _LOGGER.debug("Project config lacks transcriptome mapping for " - "organism '%s'", str(self.organism)) - - def get_sheet_dict(self): """ Create a K-V pairs for items originally passed in via the sample sheet. @@ -1837,7 +1495,7 @@ def get_sheet_dict(self): for k in self.sheet_attributes]) - def infer_columns(self): + def infer_columns(self, implications): """ Infer value for additional field(s) from other field(s). @@ -1845,16 +1503,16 @@ def infer_columns(self): that the sample's project defines as indicative of implications for additional data elements for the sample. 
+ :param Mapping implications: Project's implied columns data :return None: this function mutates state and is strictly for effect """ - if not hasattr(self.prj, IMPLICATIONS_DECLARATION): - return - - impliers = self.prj[IMPLICATIONS_DECLARATION] _LOGGER.debug( - "Sample variable(s) that can imply others: %s", str(impliers)) - for implier_name, implied in impliers.items(): + "Sample variable(s) that can imply others: %s", str(implications)) + if not implications: + return + + for implier_name, implied in implications.items(): _LOGGER.debug( "Setting Sample variable(s) implied by '%s'", implier_name) try: @@ -1878,13 +1536,15 @@ def infer_columns(self): implier_name, implier_value) - def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, + def locate_data_source(self, data_sources, column_name=DATA_SOURCE_COLNAME, source_key=None, extra_vars=None): """ Uses the template path provided in the project config section "data_sources" to piece together an actual path by substituting variables (encoded by "{variable}"") with sample attributes. + :param Mapping data_sources: mapping from key name (as a value in + a cell of a tabular data structure) to, e.g., filepath :param str column_name: Name of sample attribute (equivalently, sample sheet column) specifying a derived column. :param str source_key: The key of the data_source, @@ -1900,9 +1560,12 @@ def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, These extra variables are given a higher priority. :return str: regex expansion of data source specified in configuration, with variable substitutions made + :raises ValueError: if argument to data_sources parameter is null/empty """ - sources_section = "data_sources" + if not data_sources: + # TODO: should this be a null/empty-string return, or actual error? + raise ValueError("No data sources") if not source_key: try: @@ -1911,11 +1574,11 @@ def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, reason = "'{attr}': to locate sample's data source, provide " \ "the name of a key from '{sources}' or ensure " \ "sample has attribute '{attr}'".format( - attr=column_name, sources=sources_section) + attr=column_name, sources=DATA_SOURCES_SECTION) raise AttributeError(reason) try: - regex = self.prj[sources_section][source_key] + regex = data_sources[source_key] except KeyError: _LOGGER.warn( "Config lacks entry for data_source key: '{}' " @@ -1957,14 +1620,16 @@ def make_sample_dirs(self): _os.makedirs(path) - def set_file_paths(self): + def set_file_paths(self, project): """ Sets the paths of all files for this sample. + + :param Project project: object with pointers to data paths and such """ # Any columns specified as "derived" will be constructed # based on regex in the "data_sources" section of project config. - for col in self.prj.derived_columns: + for col in project.derived_columns: # Only proceed if the specified column exists # and was not already merged or derived. if hasattr(self, col) and col not in self.merged_cols \ @@ -1972,28 +1637,52 @@ def set_file_paths(self): # Set a variable called {col}_key, so the # original source can also be retrieved. 
setattr(self, col + COL_KEY_SUFFIX, getattr(self, col)) - setattr(self, col, self.locate_data_source(col)) + setattr(self, col, self.locate_data_source( + data_sources=project.get(DATA_SOURCES_SECTION), + column_name=col)) self.derived_cols_done.append(col) - self.infer_columns() + self.infer_columns(implications=project.get(IMPLICATIONS_DECLARATION)) # Parent - self.results_subdir = self.prj.metadata.results_subdir + self.results_subdir = project.metadata.results_subdir self.paths.sample_root = _os.path.join( - self.prj.metadata.results_subdir, self.sample_name) + project.metadata.results_subdir, self.sample_name) # Track url bigwig_filename = self.name + ".bigWig" try: # Project's public_html folder self.bigwig = _os.path.join( - self.prj.trackhubs.trackhub_dir, bigwig_filename) + project.trackhubs.trackhub_dir, bigwig_filename) self.track_url = \ - "{}/{}".format(self.prj.trackhubs.url, bigwig_filename) + "{}/{}".format(project.trackhubs.url, bigwig_filename) except: _LOGGER.debug("No trackhub/URL") pass + + def set_genome(self, genomes): + self._set_assembly("genome", genomes) + + + def set_transcriptome(self, transcriptomes): + self._set_assembly("transcriptome", transcriptomes) + + + def _set_assembly(self, ome, assemblies): + try: + assembly = assemblies[self.organism] + except AttributeError: + _LOGGER.debug("Sample '%s' lacks organism attribute", self.name) + assembly = None + except KeyError: + _LOGGER.debug("Unknown {} value: '{}'".format(ome, self.organism)) + assembly = None + _LOGGER.debug("Setting {} as {} on sample '{}'". + format(assembly, ome, self.name)) + setattr(self, ome, assembly) + def set_pipeline_attributes( self, pipeline_interface, pipeline_name, permissive=True): @@ -2136,18 +1825,39 @@ def set_read_type(self, n=10, permissive=True): feature, self.name) - def to_yaml(self, path=None, pipeline_name=None): + def to_yaml(self, path=None, subs_folder_path=None, pipeline_name=None): """ Serializes itself in YAML format. - :param str path: A file path to write yaml to. + :param str path: A file path to write yaml to; provide this or + the subs_folder_path :param str pipeline_name: name of a pipeline to which this particular Sample instance pertains (i.e., perhaps the name of a module that defined a Sample subclass of which this is an instance) + :param str subs_folder_path: path to folder in which to place file + that's being written; provide this or a full filepath :return str: filepath used (same as input if given, otherwise the path value that was inferred) + :raises ValueError: if neither full filepath nor path to extant + parent directory is provided. """ + # Determine filepath, prioritizing anything given, then falling + # back to a default using this Sample's Project's submission_subdir. + # Use the sample name and YAML extension as the file name, + # interjecting a pipeline name as a subfolder within the Project's + # submission_subdir if such a pipeline name is provided. + if not path: + if not subs_folder_path: + raise ValueError( + "To represent {} on disk, provide a full path or a path " + "to a parent (submissions) folder". 
+ format(self.__class__.__name__)) + filename = "{}.{}.yaml".format(self.sample_name, pipeline_name) \ + if pipeline_name else "{}.yaml".format(self.sample_name) + path = _os.path.join(subs_folder_path, filename) + self.yaml_file = path + def obj2dict(obj, to_skip=("samples", "sheet", "sheet_attributes")): @@ -2182,22 +1892,13 @@ def obj2dict(obj, else: return obj - - # Determine filepath, prioritizing anything given, then falling - # back to a default using this Sample's Project's submission_subdir. - # Use the sample name and YAML extension as the file name, - # interjecting a pipeline name as a subfolder within the Project's - # submission_subdir if such a pipeline name is provided. - if not path: - submission_dirpath = self.prj.metadata.submission_subdir - filename = "{}.{}.yaml".format(self.sample_name, pipeline_name) \ - if pipeline_name else "{}.yaml".format(self.sample_name) - path = _os.path.join(submission_dirpath, filename) - self.yaml_file = path - + _LOGGER.debug("Converting Sample '%s' to dictionary", self.name) serial = obj2dict(self) with open(self.yaml_file, 'w') as outfile: - outfile.write(yaml.safe_dump(serial, default_flow_style=False)) + _LOGGER.debug("Converting '%s' dictionary to YAML data", self.name) + yaml_data = yaml.safe_dump(serial, default_flow_style=False) + #yaml_data = yaml.dump(serial, Dumper=PepYamlDumper, default_flow_style=False) + outfile.write(yaml_data) def update(self, newdata): @@ -2256,8 +1957,14 @@ def file_has_pattern(pattern, filepath): # Import pipeline module and find Sample subtypes. _, modname = _os.path.split(pipeline_filepath) modname, _ = _os.path.splitext(modname) - pipeline_module = import_from_source( - name=modname, module_filepath=pipeline_filepath) + try: + pipeline_module = import_from_source( + name=modname, module_filepath=pipeline_filepath) + except ImportError as e: + _LOGGER.warn("Using base Sample because of failure in attempt to " + "import pipeline module: {}".format(e)) + return cls + _LOGGER.debug("Successfully imported pipeline module '%s', " "naming it '%s'", pipeline_filepath, pipeline_module.__name__) @@ -2340,7 +2047,11 @@ def __iter__(self): def __repr__(self): - return repr(self.pipe_iface_config) + source = self.pipe_iface_file or "mapping" + num_pipelines = len(self.pipe_iface_config) + pipelines = ", ".join(self.pipe_iface_config.keys()) + return "{} from {}, with {} pipeline(s): {}".format( + self.__class__.__name__, source, num_pipelines, pipelines) @property @@ -2467,7 +2178,8 @@ def get_arg_string(self, pipeline_name, sample): pipeline_name, value, key) raise - _LOGGER.debug("Adding '{}' from attribute '{}' for argument '{}'".format(arg, value, key)) + _LOGGER.debug("Adding '{}' from attribute '{}' for argument '{}'". + format(arg, value, key)) argstring += " " + str(key) + " " + str(arg) # Add optional arguments @@ -2568,32 +2280,31 @@ class ProtocolInterface(object): single project. Also stored are path attributes with information about the location(s) from which the PipelineInterface and ProtocolMapper came. 
- :param pipedir: location (e.g., code repository) of pipelines - :type pipedir: str + :param location: location (e.g., code repository) of pipelines + :type location: str """ - def __init__(self, pipedir): + def __init__(self, location): super(ProtocolInterface, self).__init__() - if _os.path.isdir(pipedir): - self.pipedir = pipedir - self.config_path = _os.path.join(pipedir, "config") - self.interface = PipelineInterface(_os.path.join( - self.config_path, "pipeline_interface.yaml")) + if _os.path.isdir(location): + self.location = location + self.interface_path = _os.path.join( + location, "config", "pipeline_interface.yaml") + self.interface = PipelineInterface(self.interface_path) self.protomap = ProtocolMapper(_os.path.join( - self.config_path, "protocol_mappings.yaml")) - self.pipelines_path = _os.path.join(pipedir, "pipelines") + location, "config", "protocol_mappings.yaml")) + self.pipelines_path = _os.path.join(location, "pipelines") - elif _os.path.isfile(pipedir): + elif _os.path.isfile(location): # Secondary version that passes combined yaml file directly, # instead of relying on separate hard-coded config names as above - self.pipedir = None - self.interface_file = pipedir - - self.pipelines_path = _os.path.dirname(pipedir) + self.location = None + self.interface_path = location + self.pipelines_path = _os.path.dirname(location) - with open(self.interface_file, 'r') as interface_file: + with open(location, 'r') as interface_file: iface = yaml.load(interface_file) try: if "protocol_mapping" in iface: @@ -2612,11 +2323,11 @@ def __init__(self, pipedir): else: raise ValueError("Alleged pipelines location '{}' exists neither " - "as a file nor as a folder.".format(pipedir)) + "as a file nor as a folder.".format(location)) def __repr__(self): - return repr(self.__dict__) + return "ProtocolInterface from '{}'".format(self.location) def pipeline_key_to_path(self, pipeline_key): @@ -2670,13 +2381,33 @@ class ProtocolMapper(Mapping): def __init__(self, mappings_input): if isinstance(mappings_input, Mapping): mappings = mappings_input + self.filepath = None else: # Parse file mapping protocols to pipeline(s). with open(mappings_input, 'r') as mapfile: mappings = yaml.load(mapfile) + self.filepath = mappings_input self.mappings = {k.upper(): v for k, v in mappings.items()} + def __getitem__(self, protocol_name): + return self.mappings[protocol_name] + + def __iter__(self): + return iter(self.mappings) + + def __len__(self): + return len(self.mappings) + + + def __repr__(self): + source = self.filepath or "mapping" + num_protocols = len(self.mappings) + protocols = ", ".join(self.mappings.keys()) + return "{} from {}, with {} protocol(s): {}".format( + self.__class__.__name__, source, num_protocols, protocols) + + def build_pipeline(self, protocol): """ Create command-line text for given protocol's pipeline(s). @@ -2718,19 +2449,6 @@ def register_job(self, job, dep): _LOGGER.info("Register Job Name: %s\tDep: %s", str(job), str(dep)) - def __getitem__(self, protocol_name): - return self.mappings[protocol_name] - - def __iter__(self): - return iter(self.mappings) - - def __len__(self): - return len(self.mappings) - - def __repr__(self): - return repr(self.__dict__) - - class _InvalidResourceSpecificationException(Exception): """ Pipeline interface resources--if present--needs default. 
""" diff --git a/looper/utils.py b/looper/utils.py index 4fe3c241..3db40f1c 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -125,9 +125,15 @@ def import_from_source(name, module_filepath): :param str module_filepath: path to the file that constitutes the module to import :return module: module imported from the given location, named as indicated + :raises ValueError: if path provided does not point to an extant file + :raises ImportError: if path provided is indeed an existing file, but the """ import sys + if not os.path.exists(module_filepath): + raise ValueError("Path to alleged module file doesn't point to an " + "extant file: '{}'".format(module_filepath)) + if sys.version_info >= (3, 5): from importlib import util as _il_util modspec = _il_util.spec_from_file_module_filepath( @@ -139,8 +145,9 @@ def import_from_source(name, module_filepath): mod = imp.load_source(name, module_filepath) else: # 3.3 or 3.4 - from importlib import machinery - mod = machinery.SourceFileLoader(name, module_filepath) + from importlib import machinery as _il_mach + loader = _il_mach.SourceFileLoader(name, module_filepath) + mod = loader.load_module() return mod From bc3edfdda61fc9e07816aade25169be4471f55e5 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 16:58:15 -0400 Subject: [PATCH 35/94] fix establishment of default compute and environment settings --- looper/models.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/looper/models.py b/looper/models.py index 2538db99..a86942fd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -529,7 +529,8 @@ def __init__(self, config_file, subproject=None, self.environment, self.environment_file = None, None try: - self.update_environment(default_compute) + self.update_environment( + default_compute or self.default_compute_envfile) except Exception as e: _LOGGER.error("Can't load environment config file '%s'", str(default_compute)) @@ -1148,27 +1149,26 @@ def set_compute(self, setting): # Hope that environment & environment compute are present. if setting and self.environment and "compute" in self.environment: - - # Augment compute, creating it if needed - if self.compute is None: - _LOGGER.debug("Creating Project compute") - self.compute = AttributeDict() - _LOGGER.debug("Adding entries for setting '%s'", setting) - self.compute.add_entries(self.environment.compute[setting]) - - # Ensure submission template is absolute. - if not _os.path.isabs(self.compute.submission_template): - try: - self.compute.submission_template = _os.path.join( - _os.path.dirname(self.environment_file), - self.compute.submission_template) - except AttributeError as e: - # Environment and environment compute should at least have been - # set as null-valued attributes, so execution here is an error. - _LOGGER.error(str(e)) - # Compute settings have been established. - else: - return True + # Augment compute, creating it if needed. + if self.compute is None: + _LOGGER.debug("Creating Project compute") + self.compute = AttributeDict() + _LOGGER.debug("Adding entries for setting '%s'", setting) + self.compute.add_entries(self.environment.compute[setting]) + + # Ensure submission template is absolute. 
+ if not _os.path.isabs(self.compute.submission_template): + try: + self.compute.submission_template = _os.path.join( + _os.path.dirname(self.environment_file), + self.compute.submission_template) + except AttributeError as e: + # Environment and environment compute should at least have been + # set as null-valued attributes, so execution here is an error. + _LOGGER.error(str(e)) + # Compute settings have been established. + else: + return True else: # Scenario in which environment and environment compute are # both present but don't evaluate to True is fairly @@ -1198,7 +1198,7 @@ def update_environment(self, env_settings_file): new environment configuration data """ - with open(env_settings_file or self.default_compute_envfile, 'r') as f: + with open(env_settings_file, 'r') as f: _LOGGER.info("Loading %s: %s", self.compute_env_var, env_settings_file) env_settings = yaml.load(f) From 59121db62fc6d59a63bee179fc08adc3b0a34b66 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 19 Jun 2017 17:52:44 -0400 Subject: [PATCH 36/94] dot to underscore --- looper/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/looper/models.py b/looper/models.py index a86942fd..3b863052 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1853,7 +1853,7 @@ def to_yaml(self, path=None, subs_folder_path=None, pipeline_name=None): "To represent {} on disk, provide a full path or a path " "to a parent (submissions) folder". format(self.__class__.__name__)) - filename = "{}.{}.yaml".format(self.sample_name, pipeline_name) \ + filename = "{}_{}.yaml".format(self.sample_name, pipeline_name) \ if pipeline_name else "{}.yaml".format(self.sample_name) path = _os.path.join(subs_folder_path, filename) self.yaml_file = path From d1c18c4a58670fea66e72e9656769a7e29a09701 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 18:06:07 -0400 Subject: [PATCH 37/94] message format --- looper/looper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 83e8a367..3916d3a7 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -222,8 +222,8 @@ def run(prj, args, remaining_args): sample.read_type = re.sub( '[_\\-]?end$', '', str(sample.read_type)).lower() if sample.read_type not in valid_read_types: - skip_reasons.append("{} must be in {}".\ - format("read_type", valid_read_types)) + skip_reasons.append( + "read_type must be in {}".format(valid_read_types)) # Get the base protocol-to-pipeline mappings try: From ce4e9bf05001d3dbb6ec6998b764dc7b012a0656 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 22:54:01 -0400 Subject: [PATCH 38/94] remove punctuation and match case for protocol search; ask Sample about dormancy; better messaging --- looper/__init__.py | 2 +- looper/looper.py | 36 +++++++++++++++++++----------------- looper/models.py | 29 ++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/looper/__init__.py b/looper/__init__.py index 11dcf312..8be0cf76 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -19,7 +19,7 @@ # Default user logging format is simple DEFAULT_LOGGING_FMT = "%(message)s" # Developer logger format is more information-rich -DEV_LOGGING_FMT = "%(module)s:%(lineno)d [%(levelname)s] > %(message)s " +DEV_LOGGING_FMT = "%(module)s:%(lineno)d (%(funcName)s) [%(levelname)s] > %(message)s " diff --git a/looper/looper.py b/looper/looper.py index 3916d3a7..0bd103e3 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -14,8 +14,8 @@ import pandas as _pd 
from . import setup_looper_logger, LOGGING_LEVEL, __version__ from .loodels import Project -from .models import Sample, COMPUTE_SETTINGS_VARNAME -from .utils import VersionInHelpParser +from .models import Sample, COMPUTE_SETTINGS_VARNAME, SAMPLE_EXECUTION_TOGGLE +from .utils import alpha_cased, VersionInHelpParser try: from .models import PipelineInterface, ProtocolMapper @@ -27,8 +27,6 @@ init() from colorama import Fore, Style -SAMPLE_EXECUTION_TOGGLE = "toggle" - # Descending by severity for correspondence with logic inversion. # That is, greater verbosity setting corresponds to lower logging level. _LEVEL_BY_VERBOSITY = [logging.ERROR, logging.CRITICAL, logging.WARN, @@ -195,10 +193,9 @@ def run(prj, args, remaining_args): failures = [] submission_bundle_by_protocol = \ - {p: prj.build_pipelines(p) for p in prj.protocols} + {alpha_cased(p): prj.build_pipelines(p) for p in prj.protocols} for sample in prj.samples: - _LOGGER.debug(sample) _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) sample_output_folder = os.path.join( @@ -211,10 +208,9 @@ def run(prj, args, remaining_args): skip_reasons.append("Duplicate sample name") # Check if sample should be run. - if hasattr(sample, SAMPLE_EXECUTION_TOGGLE): - if sample[SAMPLE_EXECUTION_TOGGLE] != "1": - skip_reasons.append("Column '{}' deselected". - format(SAMPLE_EXECUTION_TOGGLE)) + if sample.is_dormant(): + skip_reasons.append("Inactive status (via {})". + format(SAMPLE_EXECUTION_TOGGLE)) # Check if single_or_paired value is recognized. if hasattr(sample, "read_type"): @@ -227,18 +223,20 @@ def run(prj, args, remaining_args): # Get the base protocol-to-pipeline mappings try: - protocol = sample.library + protocol = alpha_cased(sample.library) except AttributeError: skip_reasons.append("Missing 'library' attribute") else: protocol = protocol.upper() - _LOGGER.debug("Building pipeline(s) for protocol: '{}'". + _LOGGER.debug("Fetching pipeline(s) for protocol: '{}'". format(protocol)) try: submission_bundles = submission_bundle_by_protocol[protocol] except KeyError: skip_reasons.append( - "No pipeline found for protocol {}".format(protocol)) + "No pipeline found for protocol {}; known: {}". + format(protocol, + list(submission_bundle_by_protocol.keys()))) if skip_reasons: _LOGGER.warn("> Not submitted: {}".format(skip_reasons)) @@ -264,9 +262,9 @@ def run(prj, args, remaining_args): _LOGGER.debug("Creating %s instance for sample '%s'", sample_subtype.__name__, sample.sample_name) sample = sample_subtype(sample_data) + pipeline_name, _ = os.path.splitext(pipeline_key) if sample_subtype != Sample: # Only rewrite the file if we have a proper subtype. - pipeline_name, _ = os.path.splitext(pipeline_key) _LOGGER.debug("Representing sample '%s' on disk as %s", sample.sample_name, sample_subtype.__name__) sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir, @@ -379,6 +377,8 @@ def run(prj, args, remaining_args): # Submit job! job_count += 1 + _LOGGER.debug("Attempting job submission: '%s' ('%s')", + sample.sample_name, pipeline_name) submitted = cluster_submit( sample, prj.compute.submission_template, prj.compute.submission_command, submit_settings, @@ -387,9 +387,12 @@ def run(prj, args, remaining_args): dry_run=args.dry_run, ignore_flags=args.ignore_flags, remaining_args=remaining_args) if submitted: + _LOGGER.debug("SUCCESS: submitted") submit_count += 1 + else: + _LOGGER.debug("FAILURE: not submitted") - msg = "\nLooper finished. {} of {} job(s) submitted.".\ + msg = "Looper finished. 
{} of {} job(s) submitted.".\ format(submit_count, job_count) if args.dry_run: msg += " Dry run. No jobs were actually submitted." @@ -678,8 +681,7 @@ def cluster_submit( flag_files = glob.glob(os.path.join( sample_output_folder, pipeline_name + "*.flag")) if len(flag_files) > 0: - flags = [os.path.basename(f) for f in flag_files] - _LOGGER.info("> Not submitting, flag(s) found: {}".format(flags)) + _LOGGER.info("> Not submitting, flag(s) found: {}".format(flag_files)) submit = False else: pass diff --git a/looper/models.py b/looper/models.py index 3b863052..df45ce61 100644 --- a/looper/models.py +++ b/looper/models.py @@ -77,6 +77,7 @@ SAMPLE_ANNOTATIONS_KEY = "sample_annotation" IMPLICATIONS_DECLARATION = "implied_columns" DATA_SOURCES_SECTION = "data_sources" +SAMPLE_EXECUTION_TOGGLE = "toggle" COL_KEY_SUFFIX = "_key" ATTRDICT_METADATA = {"_force_nulls": False, "_attribute_identity": False} @@ -1449,6 +1450,26 @@ def confirm_required_inputs(self, permissive=False): return True + def is_dormant(self): + """ + Determine whether this Sample is inactive. + + By default, a Sample is regarded as active. That is, if it lacks an + indication about activation status, it's assumed to be active. If, + however, and there's an indication of such status, it must be '1' + in order to be considered switched 'on.' + + :return bool: whether this Sample's been designated as dormant + """ + try: + flag = self[SAMPLE_EXECUTION_TOGGLE] + except KeyError: + # Regard default Sample state as active. + return False + # If specified, the activation flag must be set to '1'. + return flag != "1" + + def generate_name(self): """ Generate name for the sample by joining some of its attribute strings. @@ -1508,7 +1529,7 @@ def infer_columns(self, implications): """ _LOGGER.debug( - "Sample variable(s) that can imply others: %s", str(implications)) + "Sample attribute implications: {}".format(implications)) if not implications: return @@ -1892,10 +1913,12 @@ def obj2dict(obj, else: return obj - _LOGGER.debug("Converting Sample '%s' to dictionary", self.name) + _LOGGER.debug("Serializing %s: '%s'", + self.__class__.__name__, self.name) serial = obj2dict(self) with open(self.yaml_file, 'w') as outfile: - _LOGGER.debug("Converting '%s' dictionary to YAML data", self.name) + _LOGGER.debug("Generating YAML data for %s: '%s'", + self.__class__.__name__, self.name) yaml_data = yaml.safe_dump(serial, default_flow_style=False) #yaml_data = yaml.dump(serial, Dumper=PepYamlDumper, default_flow_style=False) outfile.write(yaml_data) From bd83e5702acec7d333fcb10bf1732f62f56921cb Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 19 Jun 2017 23:01:53 -0400 Subject: [PATCH 39/94] check for read_type after pipeline attributes have been set; write Sample to disk per-pipeline; message about NGS inputs --- looper/looper.py | 21 +++++++++------------ looper/models.py | 3 +++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 0bd103e3..2635e509 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -212,15 +212,6 @@ def run(prj, args, remaining_args): skip_reasons.append("Inactive status (via {})". format(SAMPLE_EXECUTION_TOGGLE)) - # Check if single_or_paired value is recognized. - if hasattr(sample, "read_type"): - # Drop "-end", "_end", or just "end" from end of the column value. 
- sample.read_type = re.sub( - '[_\\-]?end$', '', str(sample.read_type)).lower() - if sample.read_type not in valid_read_types: - skip_reasons.append( - "read_type must be in {}".format(valid_read_types)) - # Get the base protocol-to-pipeline mappings try: protocol = alpha_cased(sample.library) @@ -246,9 +237,6 @@ def run(prj, args, remaining_args): # TODO: determine what to do with subtype(s) here. # Processing preconditions have been met. processed_samples.add(sample.sample_name) - _LOGGER.debug("Writing base Sample representation to disk: '%s'", - sample.sample_name) - sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir) sample_data = sample.as_series().to_dict() # Go through all pipelines to submit for this protocol. @@ -296,6 +284,15 @@ def run(prj, args, remaining_args): _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) + # Check if single_or_paired value is recognized. + if hasattr(sample, "read_type"): + # Drop "-end", "_end", or just "end" from end of the column value. + sample.read_type = re.sub( + '[_\\-]?end$', '', str(sample.read_type)).lower() + if sample.read_type not in valid_read_types: + skip_reasons.append( + "read_type must be in {}".format(valid_read_types)) + # Identify cluster resources required for this submission. submit_settings = pipeline_interface.choose_resource_package( pipeline_key, sample.input_file_size) diff --git a/looper/models.py b/looper/models.py index df45ce61..78657219 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1735,10 +1735,13 @@ def set_pipeline_attributes( pipeline_name, "all_input_files") if self.ngs_inputs_attr: + _LOGGER.debug("Handling NGS input attributes: '%s'", self.name) # NGS data inputs exit, so we can add attributes like # read_type, read_length, paired. self.ngs_inputs = self.get_attr_values("ngs_inputs_attr") self.set_read_type(permissive=permissive) + else: + _LOGGER.debug("No NGS inputs: '%s'", self.name) # input_size if not self.all_inputs_attr: From 9a8323ba69b7bb9f5452dbab59758a9615993004 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 10:15:37 -0400 Subject: [PATCH 40/94] base Sample construction with Project by default --- looper/models.py | 52 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/looper/models.py b/looper/models.py index 78657219..53086031 100644 --- a/looper/models.py +++ b/looper/models.py @@ -379,13 +379,13 @@ def process_pipeline_interfaces(pipeline_interface_locations): -def merge_sample(data_sources, sample, merge_table, derived_columns): +def merge_sample(sample, merge_table, data_sources, derived_columns): """ Use merge table data to augment/modify Sample. - :param Mapping data_sources: collection of named paths to data locations :param Sample sample: sample to modify via merge table data :param merge_table: data with which to alter Sample + :param Mapping data_sources: collection of named paths to data locations :param derived_columns: names of columns with data-derived value :return Sample: updated input instance """ @@ -501,6 +501,10 @@ class Project(AttributeDict): settings can't be established, optional; if null (the default), a warning message will be logged, and no exception will be raised. 
:type no_compute_exception: type + :param defer_sample_construction: whether to wait to build this Project's + Sample objects until they're needed, optional; by default, the basic + Sample is created during Project construction + :type defer_sample_construction: bool :Example: @@ -518,7 +522,8 @@ class Project(AttributeDict): def __init__(self, config_file, subproject=None, default_compute=None, dry=False, permissive=True, file_checks=False, compute_env_file=None, - no_environment_exception=None, no_compute_exception=None): + no_environment_exception=None, no_compute_exception=None, + defer_sample_construction=False): _LOGGER.debug("Creating %s from file: '%s'", self.__class__.__name__, config_file) @@ -606,6 +611,7 @@ def __init__(self, config_file, subproject=None, process_pipeline_interfaces(self.metadata.pipelines_dir) self.sheet = check_sheet(self.metadata.sample_annotation) self.merge_table = None + self._samples = None if defer_sample_construction else self.samples def __repr__(self): @@ -713,9 +719,16 @@ def samples(self): :return generator[Sample]: Sample instance for each of this Project's samples """ - # TODO: account for merge table; store or re-merge every time? - # TODO: is it more likely to have a bunch of samples, or that - # TODO: use of this and thus the need to re-merge is very frequent? + if hasattr(self, "_samples") and self._samples is not None: + _LOGGER.debug("%s has %d basic Sample(s)", + len(self._samples), self.__class__.__name__) + return self._samples + else: + _LOGGER.debug("Building basic Sample(s) for %s", + self.__class__.__name__) + + # This should be executed just once, establishing the Project's + # base Sample objects if they don't already exist. if hasattr(self.metadata, "merge_table"): if self.merge_table is None: if _os.path.isfile(self.metadata.merge_table): @@ -723,20 +736,25 @@ def samples(self): self.metadata.merge_table, sep=None, engine="python") else: - _LOGGER.debug("Alleged path to merge table data is not " - "a file: '%s'", self.metadata.merge_table) + _LOGGER.debug( + "Alleged path to merge table data is not a " + "file: '%s'", self.metadata.merge_table) else: _LOGGER.debug("Already parsed merge table") else: _LOGGER.debug("No merge table") + # Define merge behavior based on presence of merge table. if self.merge_table is None: def merge(s): return s else: def merge(s): - return merge_sample(s, self.merge_table, self.derived_columns) + return merge_sample(s, self.merge_table, self.data_sources, + self.derived_columns) + # Create the Sample(s). + samples = [] for _, row in self.sheet.iterrows(): sample = Sample(row.dropna()) sample.set_genome(self.genomes) @@ -751,7 +769,10 @@ def merge(s): _LOGGER.debug("Sample '%s' lacks data source --> skipping " "data path assignment", sample.sample_name) sample = merge(sample) - yield sample + samples.append(sample) + + self._samples = samples + return self._samples @property @@ -1700,7 +1721,7 @@ def _set_assembly(self, ome, assemblies): except KeyError: _LOGGER.debug("Unknown {} value: '{}'".format(ome, self.organism)) assembly = None - _LOGGER.debug("Setting {} as {} on sample '{}'". + _LOGGER.debug("Setting {} as {} on sample: '{}'". 
format(assembly, ome, self.name)) setattr(self, ome, assembly) @@ -1984,16 +2005,19 @@ def file_has_pattern(pattern, filepath): _, modname = _os.path.split(pipeline_filepath) modname, _ = _os.path.splitext(modname) try: + _LOGGER.debug("Attempting to import module defined by {}, " + "calling it {}".format(pipeline_filepath, modname)) pipeline_module = import_from_source( name=modname, module_filepath=pipeline_filepath) except ImportError as e: _LOGGER.warn("Using base Sample because of failure in attempt to " "import pipeline module: {}".format(e)) return cls - - _LOGGER.debug("Successfully imported pipeline module '%s', " - "naming it '%s'", pipeline_filepath, + else: + _LOGGER.debug("Successfully imported pipeline module '%s', " + "naming it '%s'", pipeline_filepath, pipeline_module.__name__) + import inspect sample_subtypes = inspect.getmembers( pipeline_module, lambda obj: isinstance(obj, Sample)) From 4d725d404204759e08bb1a2395e6f5e03abff210 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 11:54:22 -0400 Subject: [PATCH 41/94] first pass at new subtype import logic --- looper/models.py | 153 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 129 insertions(+), 24 deletions(-) diff --git a/looper/models.py b/looper/models.py index 53086031..a7865851 100644 --- a/looper/models.py +++ b/looper/models.py @@ -352,10 +352,10 @@ def process_pipeline_interfaces(pipeline_interface_locations): :param Iterable[str] pipeline_interface_locations: locations, each of which should be either a directory path or a filepath, that specifies - pipeline interace and protocol mappings information. Each such file + pipeline interface and protocol mappings information. Each such file should be have a pipelines section and a protocol mappings section whereas each folder should have a file for each of those sections. - :return Mapping[str, ProtocolInterace]: mapping from protocol name to + :return Mapping[str, ProtocolInterface]: mapping from protocol name to interface(s) for which that protocol is mapped """ ifproto_by_proto_name = defaultdict(list) @@ -818,14 +818,6 @@ def build_pipelines(self, protocol, priority=True): those already mapped and those not yet mapped """ - # TODO: called from looper.run; do the import and subclass search here - # TODO: for the search, use something like subprocess with grep for - # TODO: checking for if __name__ == __main__ to determine whether it - # TODO: may run. If so, warn and skip. If not, import with something - # TODO: like imp.load_source, then use the inspect logic to search - # TODO: for Sample subclass(es), using one without __library__ as - # TODO: presumptive default. Determine what to do about specificity. - # Pull out the collection of interfaces (potentially one from each of # the locations indicated in the project configuration file) as a # sort of pool of information about possible ways in which to submit @@ -853,7 +845,8 @@ def build_pipelines(self, protocol, priority=True): this_protocol_pipelines = \ proto_iface.protomap.mappings[protocol] except KeyError: - _LOGGER.debug("Protocol '%s' lacks a mapping", protocol) + _LOGGER.debug("No mapping for protocol '%s' in '%s', skipping", + protocol, proto_iface.location) continue # TODO: update once dependency-encoding logic is in place. @@ -868,11 +861,10 @@ def build_pipelines(self, protocol, priority=True): .strip(" ()\n")\ .split(",") # These cleaned pipeline keys are what's used to resolve the path - # to the pipeline to run. 
Essentially, each pipeline key is a - # pointer to the fully-qualified location of a pipeline + # to the pipeline to run. pipeline_keys = [pk.strip() for pk in pipeline_keys] - # Skip over pipelines already been mapped by another location. + # Skip over pipelines already mapped by another location. already_mapped, new_scripts = \ partition(pipeline_keys, partial(_is_member, items=pipeline_keys_used)) @@ -899,14 +891,15 @@ def build_pipelines(self, protocol, priority=True): format(len(new_scripts), protocol, proto_iface.location, ", ".join(new_scripts))) - new_jobs = [] + new_jobs = [proto_iface.create_submission_bundle(pipeline_key, + protocol) + for pipeline_key in new_scripts] for pipeline_key in new_scripts: strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ proto_iface.pipeline_key_to_path(pipeline_key) - sample_subtype = Sample.select_sample_subtype( - full_pipe_path, protocol) + sample_subtype = proto_iface.select_sample_subtype(protocol) submission_bundle = SubmissionBundle( - proto_iface.interface, sample_subtype, + proto_iface.pipe_iface, sample_subtype, strict_pipe_key, full_pipe_path_with_flags) new_jobs.append(submission_bundle) @@ -2334,15 +2327,19 @@ class ProtocolInterface(object): :type location: str """ + + SUBTYPE_MAPPING_SECTION = "sample_subtypes" + + def __init__(self, location): super(ProtocolInterface, self).__init__() if _os.path.isdir(location): self.location = location - self.interface_path = _os.path.join( + self.pipe_iface_path = _os.path.join( location, "config", "pipeline_interface.yaml") - self.interface = PipelineInterface(self.interface_path) + self.pipe_iface = PipelineInterface(self.pipe_iface_path) self.protomap = ProtocolMapper(_os.path.join( location, "config", "protocol_mappings.yaml")) self.pipelines_path = _os.path.join(location, "pipelines") @@ -2351,7 +2348,7 @@ def __init__(self, location): # Secondary version that passes combined yaml file directly, # instead of relying on separate hard-coded config names as above self.location = None - self.interface_path = location + self.pipe_iface_path = location self.pipelines_path = _os.path.dirname(location) with open(location, 'r') as interface_file: @@ -2363,7 +2360,7 @@ def __init__(self, location): raise Exception("pipeline_interface file is missing " "a 'protocol_mapping' section.") if "pipelines" in iface: - self.interface = PipelineInterface(iface["pipelines"]) + self.pipe_iface = PipelineInterface(iface["pipelines"]) else: raise Exception("pipeline_interface file is missing " "a 'pipelines' section.") @@ -2380,6 +2377,54 @@ def __repr__(self): return "ProtocolInterface from '{}'".format(self.location) + def create_submission_bundle(self, pipeline_key, protocol): + """ + Create the collection of values needed to submit Sample for processing. 
+ + :param str pipeline_key: key for specific pipeline in a pipeline + interface mapping declaration + :param str protocol: name of the relevant protocol + :return SubmissionBundle: a namedtuple with this ProtocolInterface's + PipelineInterface, the Sample subtype to use for the submission, + the pipeline (script) key, and the full pipeline path with + command-line flags + """ + + subtype = None + + strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ + self.pipeline_key_to_path(pipeline_key) + this_pipeline_data = self.pipe_iface[strict_pipe_key] + + try: + subtypes = this_pipeline_data[self.SUBTYPE_MAPPING_SECTION] + except KeyError: + _LOGGER.debug("%s from '%s' doesn't define section '%s'", + self.pipe_iface.__class__.__name__, + self.location, self.SUBTYPE_MAPPING_SECTION) + subtype = Sample + else: + if isinstance(subtypes, str): + subtype_name = subtypes + _LOGGER.debug("Single subtype name for pipeline '%s' " + "in interface from '%s': '%s'", subtype_name, + strict_pipe_key, self.location) + else: + try: + subtype_name = subtypes[protocol] + except KeyError: + subtype = Sample + _LOGGER.debug("No %s subtype specified for pipeline '%s' " + "in interface from '%s'", subtype.__name__, + strict_pipe_key, self.location) + + # subtype_name is defined if and only if subtype remained null. + subtype = subtype or \ + _import_sample_subtype(full_pipe_path, subtype_name) + return SubmissionBundle(self.pipe_iface, subtype, + strict_pipe_key, full_pipe_path_with_flags) + + def pipeline_key_to_path(self, pipeline_key): """ Given a pipeline_key, return the path to the script for that pipeline @@ -2396,8 +2441,8 @@ def pipeline_key_to_path(self, pipeline_key): # The strict key is the script name itself, something like "ATACseq.py" strict_pipeline_key, _, pipeline_key_args = pipeline_key.partition(' ') - if self.interface.get_attribute(strict_pipeline_key, "path"): - script_path_only = self.interface.get_attribute( + if self.pipe_iface.get_attribute(strict_pipeline_key, "path"): + script_path_only = self.pipe_iface.get_attribute( strict_pipeline_key, "path")[0].strip() script_path_with_flags = \ " ".join([script_path_only, pipeline_key_args]) @@ -2418,6 +2463,58 @@ def pipeline_key_to_path(self, pipeline_key): +def _import_sample_subtype(pipeline_filepath, subtype_name): + """ + Import a particular Sample subclass from a Python module. + + :param str pipeline_filepath: path to file to regard as Python module + :param str subtype_name: name of the target class; this must derive from + the base Sample class. 
+ :return type: the imported class, defaulting to base Sample in case of + failure with the import or other logic + :raises _UndefinedSampleSubtypeException: if the module is imported but + type indicated by subtype_name is not found as a class + """ + base_type = Sample + + _, modname = _os.path.split(pipeline_filepath) + modname, _ = _os.path.splitext(modname) + + try: + _LOGGER.debug("Attempting to import module defined by {}, " + "calling it {}".format(pipeline_filepath, modname)) + pipeline_module = import_from_source( + name=modname, module_filepath=pipeline_filepath) + except ImportError as e: + _LOGGER.warn("Using base %s because of failure in attempt to " + "import pipeline module: %s", base_type.__name__, e) + return base_type + else: + _LOGGER.debug("Successfully imported pipeline module '%s', " + "naming it '%s'", pipeline_filepath, + pipeline_module.__name__) + + import inspect + def class_names(cs): + return ", ".join([c.__name__ for c in cs]) + + classes = inspect.getmembers( + pipeline_module, lambda obj: inspect.isclass(obj)) + _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) + sample_subtypes = filter(lambda c: issubclass(c, base_type), classes) + _LOGGER.debug("%d %s subtype(s): %s", len(sample_subtypes), + base_type.__name__, class_names(sample_subtypes)) + + for st in sample_subtypes: + if st.__name__ == subtype_name: + _LOGGER.debug("Successfully imported %s from '%s'", + subtype_name, pipeline_filepath) + return st + raise _UndefinedSampleSubtypeException( + subtype_name=subtype_name, pipeline_filepath=pipeline_filepath) + + + @copy class ProtocolMapper(Mapping): """ @@ -2548,5 +2645,13 @@ def __init__(self, pipeline): +class _UndefinedSampleSubtypeException(Exception): + """ Sample subtype--if declared in PipelineInterface--must be found. """ + def __init__(self, subtype_name, pipeline_filepath): + reason = "Sample subtype {} cannot be imported from '{}'".\ + format(subtype_name, pipeline_filepath) + super(_UndefinedSampleSubtypeException, self).__init__(reason) + + def _is_member(item, items): return item in items From 590751bc3106c53f361c237c4f541d8f9185ee25 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 14:33:12 -0400 Subject: [PATCH 42/94] getitem for PipelineInterface; use submission bundle creation function from ProtocolInterface; better messaging, specifically less cluttered DEBUG mode --- looper/looper.py | 7 ++-- looper/models.py | 86 ++++++++++++++++++++++++------------------------ looper/utils.py | 6 ++-- 3 files changed, 50 insertions(+), 49 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 2635e509..4367ba7a 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -192,6 +192,7 @@ def run(prj, args, remaining_args): # Create a problem list so we can keep track and show them at the end. failures = [] + _LOGGER.info("Building pipelines") submission_bundle_by_protocol = \ {alpha_cased(p): prj.build_pipelines(p) for p in prj.protocols} @@ -219,8 +220,8 @@ def run(prj, args, remaining_args): skip_reasons.append("Missing 'library' attribute") else: protocol = protocol.upper() - _LOGGER.debug("Fetching pipeline(s) for protocol: '{}'". - format(protocol)) + _LOGGER.debug("Protocol: '%s'", protocol) + _LOGGER.debug("Fetching submission bundle") try: submission_bundles = submission_bundle_by_protocol[protocol] except KeyError: @@ -247,7 +248,7 @@ def run(prj, args, remaining_args): # pipeline_key (previously pl_id) is no longer necessarily # script name, it's more flexible. 
- _LOGGER.debug("Creating %s instance for sample '%s'", + _LOGGER.debug("Creating %s instance: '%s'", sample_subtype.__name__, sample.sample_name) sample = sample_subtype(sample_data) pipeline_name, _ = os.path.splitext(pipeline_key) diff --git a/looper/models.py b/looper/models.py index a7865851..77e6ffd3 100644 --- a/looper/models.py +++ b/looper/models.py @@ -280,21 +280,21 @@ def __setitem__(self, key, value): if isinstance(value, Mapping): try: # Combine AttributeDict instances. - _LOGGER.debug("Updating key: '{}'".format(key)) + _LOGGER.log(5, "Updating key: '{}'".format(key)) self.__dict__[key].add_entries(value) except (AttributeError, KeyError): # Create new AttributeDict, replacing previous value. self.__dict__[key] = AttributeDict(value) - _LOGGER.debug("'{}' now has keys {}". + _LOGGER.log(5, "'{}' now has keys {}". format(key, self.__dict__[key].keys())) elif value is not None or \ key not in self.__dict__ or self.__dict__["_force_nulls"]: _LOGGER.log(5, "Setting '{}' to {}".format(key, value)) self.__dict__[key] = value else: - _LOGGER.debug("Not setting {k} to {v}; _force_nulls: {nulls}". - format(k=key, v=value, - nulls=self.__dict__["_force_nulls"])) + _LOGGER.log(5, "Not setting {k} to {v}; _force_nulls: {nulls}". + format(k=key, v=value, + nulls=self.__dict__["_force_nulls"])) def __getitem__(self, item): @@ -366,7 +366,7 @@ def process_pipeline_interfaces(pipeline_interface_locations): continue proto_iface = ProtocolInterface(pipe_iface_location) for proto_name in proto_iface.protomap: - _LOGGER.debug("Protocol name: {}".format(proto_name)) + _LOGGER.log(5, "Adding protocol name: '%s'", proto_name) ifproto_by_proto_name[proto_name].append(proto_iface) return ifproto_by_proto_name @@ -721,7 +721,7 @@ def samples(self): """ if hasattr(self, "_samples") and self._samples is not None: _LOGGER.debug("%s has %d basic Sample(s)", - len(self._samples), self.__class__.__name__) + self.__class__.__name__, len(self._samples)) return self._samples else: _LOGGER.debug("Building basic Sample(s) for %s", @@ -883,26 +883,17 @@ def build_pipelines(self, protocol, priority=True): pipeline_keys, pipeline_keys_used, disjoint_partition_violation) - _LOGGER.debug("Skipping {} already-mapped script names: {}". - format(len(already_mapped), - ", ".join(already_mapped))) + if len(already_mapped) > 0: + _LOGGER.debug("Skipping {} already-mapped script name(s): {}". + format(len(already_mapped), already_mapped)) _LOGGER.debug("{} new scripts for protocol {} from " - "pipelines warehouse '{}': {}". + "pipeline(s) location '{}': {}". 
format(len(new_scripts), protocol, - proto_iface.location, ", ".join(new_scripts))) + proto_iface.location, new_scripts)) new_jobs = [proto_iface.create_submission_bundle(pipeline_key, protocol) for pipeline_key in new_scripts] - for pipeline_key in new_scripts: - strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ - proto_iface.pipeline_key_to_path(pipeline_key) - sample_subtype = proto_iface.select_sample_subtype(protocol) - submission_bundle = SubmissionBundle( - proto_iface.pipe_iface, sample_subtype, - strict_pipe_key, full_pipe_path_with_flags) - new_jobs.append(submission_bundle) - job_submission_bundles.append(new_jobs) # Repeat logic check of short-circuit conditional to account for @@ -1110,12 +1101,12 @@ def parse_config_file(self, subproject=None): _LOGGER.debug("Parsing relative sections") for sect in relative_sections: if not hasattr(self, sect): - _LOGGER.debug("%s lacks relative section '%s', skipping", - self.__class__.__name__, sect) + _LOGGER.log(5, "%s lacks relative section '%s', skipping", + self.__class__.__name__, sect) continue relative_vars = getattr(self, sect) if not relative_vars: - _LOGGER.debug("No relative variables, continuing") + _LOGGER.log(5, "No relative variables, continuing") continue for var in relative_vars.keys(): if not hasattr(relative_vars, var) or \ @@ -1126,18 +1117,17 @@ def parse_config_file(self, subproject=None): _LOGGER.debug("Ensuring absolute path(s) for '%s'", var) # Parsed from YAML, so small space of possible datatypes. if isinstance(relpath, list): - setattr(relative_vars, var, - [self._ensure_absolute(maybe_relpath) - for maybe_relpath in relpath]) + absolute = [self._ensure_absolute(maybe_relpath) + for maybe_relpath in relpath] else: - abs_path = self._ensure_absolute(relpath) - _LOGGER.debug("Setting '%s' to '%s'", var, abs_path) - setattr(relative_vars, var, abs_path) + absolute = self._ensure_absolute(relpath) + _LOGGER.debug("Setting '%s' to '%s'", var, absolute) + setattr(relative_vars, var, absolute) # Project config may have made compute.submission_template relative. # Make sure it's absolute. if self.compute is None: - _LOGGER.debug("No compute, no submission template") + _LOGGER.log(5, "No compute, no submission template") elif not _os.path.isabs(self.compute.submission_template): # Relative to environment config file. self.compute.submission_template = _os.path.join( @@ -1243,21 +1233,21 @@ def update_environment(self, env_settings_file): def _ensure_absolute(self, maybe_relpath): """ Ensure that a possibly relative path is absolute. """ - _LOGGER.debug("Ensuring absolute: '%s'", maybe_relpath) + _LOGGER.log(5, "Ensuring absolute: '%s'", maybe_relpath) if _os.path.isabs(maybe_relpath) or is_url(maybe_relpath): - _LOGGER.debug("Already absolute") + _LOGGER.log(5, "Already absolute") return maybe_relpath # Maybe we have env vars that make the path absolute? expanded = _os.path.expandvars(maybe_relpath) - _LOGGER.debug("Expanded: '%s'", expanded) + _LOGGER.log(5, "Expanded: '%s'", expanded) if _os.path.isabs(expanded): - _LOGGER.debug("Expanded is absolute") + _LOGGER.log(5, "Expanded is absolute") return expanded - _LOGGER.debug("Making non-absolute path '%s' be absolute", + _LOGGER.log(5, "Making non-absolute path '%s' be absolute", maybe_relpath) # Set path to an absolute path, relative to project config. 
config_dirpath = _os.path.dirname(self.config_file) - _LOGGER.debug("config_dirpath: %s", config_dirpath) + _LOGGER.log(5, "config_dirpath: %s", config_dirpath) abs_path = _os.path.join(config_dirpath, maybe_relpath) return abs_path @@ -1542,8 +1532,8 @@ def infer_columns(self, implications): :return None: this function mutates state and is strictly for effect """ - _LOGGER.debug( - "Sample attribute implications: {}".format(implications)) + _LOGGER.log(5, "Sample attribute implications: {}". + format(implications)) if not implications: return @@ -1712,10 +1702,11 @@ def _set_assembly(self, ome, assemblies): _LOGGER.debug("Sample '%s' lacks organism attribute", self.name) assembly = None except KeyError: - _LOGGER.debug("Unknown {} value: '{}'".format(ome, self.organism)) + _LOGGER.log(5, "Unknown {} value: '{}'". + format(ome, self.organism)) assembly = None - _LOGGER.debug("Setting {} as {} on sample: '{}'". - format(assembly, ome, self.name)) + _LOGGER.log(5, "Setting {} as {} on sample: '{}'". + format(assembly, ome, self.name)) setattr(self, ome, assembly) @@ -2085,6 +2076,14 @@ def __init__(self, config): self.pipe_iface_config = yaml.load(f) + def __getitem__(self, item): + try: + return self._select_pipeline(item) + except _MissingPipelineConfigurationException: + raise KeyError("{} is not a known pipeline; known: {}". + format(item, self.pipe_iface_config.keys())) + + def __iter__(self): return iter(self.pipe_iface_config.items()) @@ -2421,6 +2420,7 @@ def create_submission_bundle(self, pipeline_key, protocol): # subtype_name is defined if and only if subtype remained null. subtype = subtype or \ _import_sample_subtype(full_pipe_path, subtype_name) + _LOGGER.debug("Using Sample subtype: %s", subtype.__name__) return SubmissionBundle(self.pipe_iface, subtype, strict_pipe_key, full_pipe_path_with_flags) @@ -2497,7 +2497,7 @@ def _import_sample_subtype(pipeline_filepath, subtype_name): import inspect def class_names(cs): return ", ".join([c.__name__ for c in cs]) - + classes = inspect.getmembers( pipeline_module, lambda obj: inspect.isclass(obj)) _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) diff --git a/looper/utils.py b/looper/utils.py index 3db40f1c..ea47a36e 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -216,15 +216,15 @@ def partition(items, test): assume that the argument is not terribly large and that the function is cheap to compute and use a simpler single-pass approach. 
- :param collections.Iterable[object] items: items to partition + :param Sized[object] items: items to partition :param function(object) -> bool test: test to apply to each item to perform the partitioning procedure :return: list[object], list[object]: partitioned items sequences """ passes, fails = [], [] - _LOGGER.debug("Testing {} items: {}".format(len(items), items)) + _LOGGER.log(5, "Testing {} items: {}".format(len(items), items)) for item in items: - _LOGGER.debug("Testing item {}".format(item)) + _LOGGER.log(5, "Testing item {}".format(item)) group = passes if test(item) else fails group.append(item) return passes, fails From 8bb9fd9a60bb5362ce425828d040dbbd22f10404 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 15:18:08 -0400 Subject: [PATCH 43/94] fix up the matching of protocol names/keys in mappings; provide function for that --- looper/looper.py | 21 +++++++++++---------- looper/models.py | 29 ++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 4367ba7a..b4cf9e78 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -192,9 +192,11 @@ def run(prj, args, remaining_args): # Create a problem list so we can keep track and show them at the end. failures = [] - _LOGGER.info("Building pipelines") + _LOGGER.info("Building submission bundle(s) for protocol(s): {}". + format(list(prj.protocols))) submission_bundle_by_protocol = \ - {alpha_cased(p): prj.build_pipelines(p) for p in prj.protocols} + {alpha_cased(p): prj.build_pipelines(alpha_cased(p)) + for p in prj.protocols} for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) @@ -220,15 +222,14 @@ def run(prj, args, remaining_args): skip_reasons.append("Missing 'library' attribute") else: protocol = protocol.upper() - _LOGGER.debug("Protocol: '%s'", protocol) _LOGGER.debug("Fetching submission bundle") try: + _LOGGER.debug("Using '%s' as protocol key", protocol) submission_bundles = submission_bundle_by_protocol[protocol] except KeyError: - skip_reasons.append( - "No pipeline found for protocol {}; known: {}". - format(protocol, - list(submission_bundle_by_protocol.keys()))) + skip_reasons.append("No pipeline found for protocol") + if not submission_bundles: + skip_reasons.append("No submission bundle for protocol") if skip_reasons: _LOGGER.warn("> Not submitted: {}".format(skip_reasons)) @@ -403,8 +404,7 @@ def run(prj, args, remaining_args): _LOGGER.info("{} unique reasons for submission failure: {}".format( len(sample_by_reason), list(sample_by_reason.keys()))) - _LOGGER.info("Per-sample submission failure count for " - "each reason: {}".format(sample_by_reason)) + _LOGGER.info("Samples by failure: {}".format(dict(sample_by_reason))) @@ -687,7 +687,8 @@ def cluster_submit( if not submit: return False if dry_run: - _LOGGER.info("> DRY RUN: I would have submitted this") + _LOGGER.info("> DRY RUN: I would have submitted this: '%s'", + submit_script) else: subprocess.call(submission_command + " " + submit_script, shell=True) time.sleep(time_delay) # Delay next job's submission. 
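(Aside, not part of the diff: a minimal sketch of the protocol-key normalization that the changes above and below rely on. The body mirrors the SampleSheet.alpha_cased staticmethod removed earlier in this series; the module-level helper imported from looper.utils is assumed to behave the same way.)

    def alpha_cased(text, lower=False):
        # Keep only letters, then homogenize case (uppercase by default).
        text = "".join(filter(lambda c: c.isalpha(), text))
        return text.lower() if lower else text.upper()

    # Punctuation and case differences collapse onto a single lookup key,
    # which is why both the protocol-to-bundle dict built in run() and the
    # ProtocolMapper keys are constructed with this helper.
    assert alpha_cased("RNA-seq") == alpha_cased("rnaseq") == "RNASEQ"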
diff --git a/looper/models.py b/looper/models.py index 77e6ffd3..8d942201 100644 --- a/looper/models.py +++ b/looper/models.py @@ -367,7 +367,7 @@ def process_pipeline_interfaces(pipeline_interface_locations): proto_iface = ProtocolInterface(pipe_iface_location) for proto_name in proto_iface.protomap: _LOGGER.log(5, "Adding protocol name: '%s'", proto_name) - ifproto_by_proto_name[proto_name].append(proto_iface) + ifproto_by_proto_name[alpha_cased(proto_name)].append(proto_iface) return ifproto_by_proto_name @@ -823,7 +823,8 @@ def build_pipelines(self, protocol, priority=True): # sort of pool of information about possible ways in which to submit # pipeline(s) for sample(s) of the indicated protocol. try: - protocol_interfaces = self.interfaces_by_protocol[protocol] + protocol_interfaces = \ + self.interfaces_by_protocol[protocol] except KeyError: _LOGGER.warn("Unknown protocol: '{}'".format(protocol)) return [] @@ -841,12 +842,10 @@ def build_pipelines(self, protocol, priority=True): if priority and len(job_submission_bundles) > 0: return job_submission_bundles[0] - try: - this_protocol_pipelines = \ - proto_iface.protomap.mappings[protocol] - except KeyError: - _LOGGER.debug("No mapping for protocol '%s' in '%s', skipping", - protocol, proto_iface.location) + this_protocol_pipelines = proto_iface.fetch(protocol) + if not this_protocol_pipelines: + _LOGGER.warn("No mapping for protocol '%s' in '%s', skipping", + protocol, proto_iface.location) continue # TODO: update once dependency-encoding logic is in place. @@ -2425,6 +2424,18 @@ def create_submission_bundle(self, pipeline_key, protocol): strict_pipe_key, full_pipe_path_with_flags) + def fetch(self, protocol): + """ + Fetch the mapping for a particular protocol, null if unmapped. + + :param str protocol: + :return str | Iterable[str] | NoneType: pipeline(s) to which the given + protocol is mapped, otherwise null + """ + return self.protomap.mappings.get(alpha_cased(protocol)) + + + def pipeline_key_to_path(self, pipeline_key): """ Given a pipeline_key, return the path to the script for that pipeline @@ -2534,7 +2545,7 @@ def __init__(self, mappings_input): with open(mappings_input, 'r') as mapfile: mappings = yaml.load(mapfile) self.filepath = mappings_input - self.mappings = {k.upper(): v for k, v in mappings.items()} + self.mappings = {alpha_cased(k): v for k, v in mappings.items()} def __getitem__(self, protocol_name): From 5a1c447d36929ed70e07f3fb45383628e419bb9c Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 15:40:45 -0400 Subject: [PATCH 44/94] earlier pipeline locations finalization to ensure a collection; fix the looper counting --- looper/looper.py | 29 +++++++++++++++++++---------- looper/models.py | 3 ++- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index b4cf9e78..bfc59821 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -181,12 +181,14 @@ def run(prj, args, remaining_args): to be passed on to parser(s) elsewhere """ - _start_counter(prj.num_samples) + num_samples = prj.num_samples + _start_counter(num_samples) valid_read_types = ["single", "paired"] # Keep track of how many jobs have been submitted. - submit_count = 0 - job_count = 0 + num_job_templates = 0 # Each sample can have multiple jobs. + job_count = 0 # Some job templates will be skipped. + submit_count = 0 # Some jobs won't be submitted. processed_samples = set() # Create a problem list so we can keep track and show them at the end. 
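[A quick aside on the ProtocolInterface.fetch method added just above: it swaps a try/except KeyError access of protomap.mappings for a get-or-None lookup, so callers can skip an unmapped protocol with a plain truthiness check. A minimal sketch of that pattern, using made-up mapping data and protocol keys (in the patch itself the key is additionally passed through alpha_cased first):]

    # Hypothetical protocol-to-pipeline mapping for illustration only.
    mappings = {"RNASEQ": "rnaseq_pipeline.py", "CHIPSEQ": ["chip.py", "peaks.py"]}

    def fetch(protocol_key):
        # Return the mapped pipeline(s), or None if the protocol is unmapped.
        return mappings.get(protocol_key)

    for key in ("RNASEQ", "WGBS"):
        pipelines = fetch(key)
        if not pipelines:
            print("No mapping for protocol '{}', skipping".format(key))
            continue
        print("{} -> {}".format(key, pipelines))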
@@ -244,10 +246,11 @@ def run(prj, args, remaining_args): # Go through all pipelines to submit for this protocol. # Note: control flow doesn't reach this point if variable "pipelines" # cannot be assigned (library/protocol missing). + # pipeline_key (previously pl_id) is no longer necessarily + # script name, it's more flexible. for pipeline_interface, sample_subtype, pipeline_key, pipeline_job \ in submission_bundles: - # pipeline_key (previously pl_id) is no longer necessarily - # script name, it's more flexible. + num_job_templates += 1 _LOGGER.debug("Creating %s instance: '%s'", sample_subtype.__name__, sample.sample_name) @@ -314,7 +317,8 @@ def run(prj, args, remaining_args): # Append arguments for this pipeline # Sample-level arguments are handled by the pipeline interface. try: - argstring = pipeline_interface.get_arg_string(pipeline_key, sample) + argstring = pipeline_interface.get_arg_string( + pipeline_key, sample) argstring += " " except AttributeError: # TODO: inform about which missing attribute(s). @@ -346,7 +350,8 @@ def run(prj, args, remaining_args): # because we don't care about parameters here. if hasattr(prj.pipeline_config, pipeline_key): # First priority: pipeline config in project config - pl_config_file = getattr(prj.pipeline_config, pipeline_key) + pl_config_file = getattr(prj.pipeline_config, + pipeline_key) # Make sure it's a file (it could be provided as null.) if pl_config_file: if not os.path.isfile(pl_config_file): @@ -371,7 +376,8 @@ def run(prj, args, remaining_args): "lack memory specification") # Add the command string and job name to the submit_settings object - submit_settings["JOBNAME"] = sample.sample_name + "_" + pipeline_key + submit_settings["JOBNAME"] = \ + sample.sample_name + "_" + pipeline_key submit_settings["CODE"] = cmd # Submit job! @@ -391,8 +397,11 @@ def run(prj, args, remaining_args): else: _LOGGER.debug("FAILURE: not submitted") - msg = "Looper finished. {} of {} job(s) submitted.".\ - format(submit_count, job_count) + msg = "Looper finished. {} of {} sample(s) generated job template(s); " \ + "{} of {} job template(s) were considered for submission, and " \ + "{} of those were actually submitted.".format( + len(processed_samples), num_samples, + job_count, num_job_templates, submit_count) if args.dry_run: msg += " Dry run. No jobs were actually submitted." diff --git a/looper/models.py b/looper/models.py index 8d942201..8e06f326 100644 --- a/looper/models.py +++ b/looper/models.py @@ -601,12 +601,13 @@ def __init__(self, config_file, subproject=None, except AttributeError: self.derived_columns = self.DERIVED_COLUMNS_DEFAULT + self.finalize_pipelines_directory() + # SampleSheet creation populates project's samples, adds the # sheet itself, and adds any derived columns. _LOGGER.debug("Processing {} pipeline location(s): {}". 
format(len(self.metadata.pipelines_dir), self.metadata.pipelines_dir)) - self.finalize_pipelines_directory() self.interfaces_by_protocol = \ process_pipeline_interfaces(self.metadata.pipelines_dir) self.sheet = check_sheet(self.metadata.sample_annotation) From be6094b753079f499d3efca69b1ea616002fb24d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 16:34:12 -0400 Subject: [PATCH 45/94] sheet function and better names/messages --- looper/looper.py | 6 +++--- looper/models.py | 47 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index bfc59821..f982c447 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -196,9 +196,9 @@ def run(prj, args, remaining_args): _LOGGER.info("Building submission bundle(s) for protocol(s): {}". format(list(prj.protocols))) - submission_bundle_by_protocol = \ - {alpha_cased(p): prj.build_pipelines(alpha_cased(p)) - for p in prj.protocols} + submission_bundle_by_protocol = { + alpha_cased(p): prj.build_submission_bundles( + alpha_cased(p)) for p in prj.protocols} for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) diff --git a/looper/models.py b/looper/models.py index 8e06f326..976bc6dd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -358,7 +358,7 @@ def process_pipeline_interfaces(pipeline_interface_locations): :return Mapping[str, ProtocolInterface]: mapping from protocol name to interface(s) for which that protocol is mapped """ - ifproto_by_proto_name = defaultdict(list) + interface_by_protocol = defaultdict(list) for pipe_iface_location in pipeline_interface_locations: if not _os.path.exists(pipe_iface_location): _LOGGER.warn("Ignoring nonexistent pipeline interface " @@ -367,8 +367,8 @@ def process_pipeline_interfaces(pipeline_interface_locations): proto_iface = ProtocolInterface(pipe_iface_location) for proto_name in proto_iface.protomap: _LOGGER.log(5, "Adding protocol name: '%s'", proto_name) - ifproto_by_proto_name[alpha_cased(proto_name)].append(proto_iface) - return ifproto_by_proto_name + interface_by_protocol[alpha_cased(proto_name)].append(proto_iface) + return interface_by_protocol @@ -717,7 +717,7 @@ def samples(self): """ Generic/base Sample instance for each of this Project's samples. - :return generator[Sample]: Sample instance for each + :return Iterable[Sample]: Sample instance for each of this Project's samples """ if hasattr(self, "_samples") and self._samples is not None: @@ -767,7 +767,7 @@ def merge(s): try: sample.data_path = sample.data_source except AttributeError: - _LOGGER.debug("Sample '%s' lacks data source --> skipping " + _LOGGER.debug("Sample '%s' lacks data source; skipping " "data path assignment", sample.sample_name) sample = merge(sample) samples.append(sample) @@ -778,7 +778,11 @@ def merge(s): @property def templates_folder(self): - """ Path to folder with default submission templates. """ + """ + Path to folder with default submission templates. + + :return str: path to folder with default submission templates + """ return _os.path.join(_os.path.dirname(__file__), "submit_templates") @@ -798,7 +802,7 @@ def infer_name(path_config_file): return config_folder - def build_pipelines(self, protocol, priority=True): + def build_submission_bundles(self, protocol, priority=True): """ Create pipelines to submit for each sample of a particular protocol. 
@@ -986,6 +990,30 @@ def create_argtext(name): return pipeline_argtext + def build_sheet(self, *protocols): + """ + Create all Sample object for this project for the given protocol(s). + + :return pandas.core.frame.DataFrame: DataFrame with from base version + of each of this Project's samples, for indicated protocol(s) if + given, else all of this Project's samples + """ + # Use all protocols if none are explicitly specified. + protocols = set(protocols or self.protocols) + if protocols: + protocols = set(protocols) + def include(sample): + try: + return sample.library in protocols + except AttributeError: + return False + else: + def include(_): + return True + + return _pd.DataFrame([s for s in self.samples if include(s)]) + + def make_project_dirs(self): """ Creates project directory structure if it doesn't exist. @@ -1350,12 +1378,9 @@ def __init__(self, series): self.yaml_file = None # Sample dirs - self.paths = Paths() # Only when sample is added to project, can paths be added - # This is because sample-specific files will be created in a - # data root directory dependent on the project. - # The SampleSheet object, after being added to a project, will - # call Sample.set_file_paths(). + self.paths = Paths() def __getitem__(self, item): From 7b6614adc3fbb60e83f8f5189e4bf7b4764b0dc8 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 16:48:49 -0400 Subject: [PATCH 46/94] docstrings and cleanup --- looper/models.py | 561 ++++++++++++++++++++--------------------------- 1 file changed, 234 insertions(+), 327 deletions(-) diff --git a/looper/models.py b/looper/models.py index 976bc6dd..7c1b72b8 100644 --- a/looper/models.py +++ b/looper/models.py @@ -88,6 +88,28 @@ +def check_sheet(sample_file, dtype=str): + """ + Check if csv file exists and has all required columns. + + :param str sample_file: path to sample annotations file. + :param type dtype: data type for CSV read. + :raises IOError: if given annotations file can't be read. + :raises ValueError: if required column(s) is/are missing. + """ + + df = _pd.read_table(sample_file, sep=None, dtype=dtype, + index_col=False, engine="python") + req = [SAMPLE_NAME_COLNAME] + missing = set(req) - set(df.columns) + if len(missing) != 0: + raise ValueError( + "Annotation sheet ('{}') is missing column(s): {}; has: {}". + format(sample_file, missing, df.columns)) + return df + + + def copy(obj): def copy(self): """ @@ -101,40 +123,153 @@ def copy(self): +def include_in_repr(attr, klazz): + """ + Determine whether to include attribute in an object's text representation. + + :param str attr: attribute to include/exclude from object's representation + :param str | type klazz: name of type or type itself of which the object + to be represented is an instance + :return bool: whether to include attribute in an object's + text representation + """ + classname = klazz.__name__ if isinstance(klazz, type) else klazz + return attr not in \ + {"Project": ["sheet", "interfaces_by_protocol"]}[classname] + + + def is_url(maybe_url): + """ + Determine whether a path is a URL. + + :param str maybe_url: path to investigate as URL + :return bool: whether path appears to be a URL + """ return urlparse(maybe_url).scheme != "" -def include_in_repr(attr, klazz): - return attr not in \ - {"Project": ["sheet", "interfaces_by_protocol"]}[klazz.__name__] +def merge_sample(sample, merge_table, data_sources, derived_columns): + """ + Use merge table data to augment/modify Sample. 
+ :param Sample sample: sample to modify via merge table data + :param merge_table: data with which to alter Sample + :param Mapping data_sources: collection of named paths to data locations + :param derived_columns: names of columns with data-derived value + :return Sample: updated input instance + """ + if SAMPLE_NAME_COLNAME not in merge_table.columns: + raise KeyError( + "Merge table requires a column named '{}'.". + format(SAMPLE_NAME_COLNAME)) -class PepYamlRepresenter(yaml.representer.Representer): - """ Should object's YAML representation fail, get additional info. """ + sample_indexer = merge_table[SAMPLE_NAME_COLNAME] == \ + getattr(sample, SAMPLE_NAME_COLNAME) + merge_rows = merge_table[sample_indexer] - def represent_data(self, data): - """ - Supplement PyYAML's context info in case of representation failure. + if len(merge_rows) > 0: + # For each row in the merge table of this sample: + # 1) populate any derived columns + # 2) derived columns --> space-delimited strings + # 3) update the sample values with the merge table - :param object data: same as superclass - :return object: same as superclass - """ - try: - return super(PepYamlRepresenter, self).represent_data(data) - except yaml.representer.RepresenterError: - _LOGGER.error("YAML representation error: {} ({})". - format(data, type(data))) - raise + # Keep track of merged cols, + # so we don't re-derive them later. + merged_cols = { + key: "" for key in merge_rows.columns} + for _, row in merge_rows.iterrows(): + row_dict = row.to_dict() + for col in merge_rows.columns: + if col == SAMPLE_NAME_COLNAME or \ + col not in derived_columns: + continue + # Initialize key in parent dict. + col_key = col + COL_KEY_SUFFIX + merged_cols[col_key] = "" + row_dict[col_key] = row_dict[col] + row_dict[col] = sample.locate_data_source( + data_sources, col, row_dict[col], row_dict) # 1) + # Also add in any derived cols present. + for col in derived_columns: + # Skip over attributes that the sample + # either lacks, and those covered by the + # data from the current (row's) data. + if not hasattr(sample, col) or \ + col in row_dict: + continue + # Map column name key to sample's value + # for the attribute given by column name. + col_key = col + COL_KEY_SUFFIX + row_dict[col_key] = getattr(sample, col) + # Map the column name itself to the + # populated data source template string. + row_dict[col] = sample.locate_data_source( + data_sources, col, getattr(sample, col), row_dict) + _LOGGER.debug("PROBLEM adding derived column: " + "{}, {}, {}".format(col, row_dict[col], + getattr(sample, col))) -# Bespoke YAML dumper, using the custom data/object Representer. -PepYamlDumper = type("PepYamlDumper", - (yaml.emitter.Emitter, yaml.serializer.Serializer, - PepYamlRepresenter, yaml.resolver.Resolver), - dict(yaml.dumper.Dumper.__dict__)) + # Since we are now jamming multiple (merged) + # entries into a single attribute, we have to + # join them into a space-delimited string + # and then set to sample attribute. + for key, val in row_dict.items(): + if key == SAMPLE_NAME_COLNAME or not val: + continue + _LOGGER.debug("merge: sample '%s'; %s=%s", + str(sample.name), str(key), str(val)) + if not key in merged_cols: + new_val = str(val).rstrip() + else: + new_val = "{} {}".format( + merged_cols[key], str(val)).strip() + merged_cols[key] = new_val # 2) + + # Don't update sample_name. 
+ merged_cols.pop(SAMPLE_NAME_COLNAME, None) + + sample.update(merged_cols) # 3) + sample.merged = True # mark sample as merged + sample.merged_cols = merged_cols + + return sample + + + +def process_pipeline_interfaces(pipeline_interface_locations): + """ + Create a ProtocolInterface for each pipeline location given. + + :param Iterable[str] pipeline_interface_locations: locations, each of + which should be either a directory path or a filepath, that specifies + pipeline interface and protocol mappings information. Each such file + should be have a pipelines section and a protocol mappings section + whereas each folder should have a file for each of those sections. + :return Mapping[str, ProtocolInterface]: mapping from protocol name to + interface(s) for which that protocol is mapped + """ + interface_by_protocol = defaultdict(list) + for pipe_iface_location in pipeline_interface_locations: + if not _os.path.exists(pipe_iface_location): + _LOGGER.warn("Ignoring nonexistent pipeline interface " + "location '%s'", pipe_iface_location) + continue + proto_iface = ProtocolInterface(pipe_iface_location) + for proto_name in proto_iface.protomap: + _LOGGER.log(5, "Adding protocol name: '%s'", proto_name) + interface_by_protocol[alpha_cased(proto_name)].append(proto_iface) + return interface_by_protocol + + + +# Collect PipelineInterface, Sample type, pipeline path, and script with flags. +SubmissionBundle = namedtuple( + "SubmissionBundle", + field_names=["interface", "subtype", "pipeline", "pipeline_with_flags"]) @@ -346,129 +481,6 @@ def __repr__(self): -def process_pipeline_interfaces(pipeline_interface_locations): - """ - Create a ProtocolInterface for each pipeline location given. - - :param Iterable[str] pipeline_interface_locations: locations, each of - which should be either a directory path or a filepath, that specifies - pipeline interface and protocol mappings information. Each such file - should be have a pipelines section and a protocol mappings section - whereas each folder should have a file for each of those sections. - :return Mapping[str, ProtocolInterface]: mapping from protocol name to - interface(s) for which that protocol is mapped - """ - interface_by_protocol = defaultdict(list) - for pipe_iface_location in pipeline_interface_locations: - if not _os.path.exists(pipe_iface_location): - _LOGGER.warn("Ignoring nonexistent pipeline interface " - "location '%s'", pipe_iface_location) - continue - proto_iface = ProtocolInterface(pipe_iface_location) - for proto_name in proto_iface.protomap: - _LOGGER.log(5, "Adding protocol name: '%s'", proto_name) - interface_by_protocol[alpha_cased(proto_name)].append(proto_iface) - return interface_by_protocol - - - -# Collect PipelineInterface, Sample type, pipeline path, and script with flags. -SubmissionBundle = namedtuple( - "SubmissionBundle", - field_names=["interface", "subtype", "pipeline", "pipeline_with_flags"]) - - - -def merge_sample(sample, merge_table, data_sources, derived_columns): - """ - Use merge table data to augment/modify Sample. - - :param Sample sample: sample to modify via merge table data - :param merge_table: data with which to alter Sample - :param Mapping data_sources: collection of named paths to data locations - :param derived_columns: names of columns with data-derived value - :return Sample: updated input instance - """ - - if SAMPLE_NAME_COLNAME not in merge_table.columns: - raise KeyError( - "Merge table requires a column named '{}'.". 
- format(SAMPLE_NAME_COLNAME)) - - sample_indexer = merge_table[SAMPLE_NAME_COLNAME] == \ - getattr(sample, SAMPLE_NAME_COLNAME) - merge_rows = merge_table[sample_indexer] - - if len(merge_rows) > 0: - # For each row in the merge table of this sample: - # 1) populate any derived columns - # 2) derived columns --> space-delimited strings - # 3) update the sample values with the merge table - - # Keep track of merged cols, - # so we don't re-derive them later. - merged_cols = { - key: "" for key in merge_rows.columns} - for _, row in merge_rows.iterrows(): - row_dict = row.to_dict() - for col in merge_rows.columns: - if col == SAMPLE_NAME_COLNAME or \ - col not in derived_columns: - continue - # Initialize key in parent dict. - col_key = col + COL_KEY_SUFFIX - merged_cols[col_key] = "" - row_dict[col_key] = row_dict[col] - row_dict[col] = sample.locate_data_source( - data_sources, col, row_dict[col], row_dict) # 1) - - # Also add in any derived cols present. - for col in derived_columns: - # Skip over attributes that the sample - # either lacks, and those covered by the - # data from the current (row's) data. - if not hasattr(sample, col) or \ - col in row_dict: - continue - # Map column name key to sample's value - # for the attribute given by column name. - col_key = col + COL_KEY_SUFFIX - row_dict[col_key] = getattr(sample, col) - # Map the column name itself to the - # populated data source template string. - row_dict[col] = sample.locate_data_source( - data_sources, col, getattr(sample, col), row_dict) - _LOGGER.debug("PROBLEM adding derived column: " - "{}, {}, {}".format(col, row_dict[col], - getattr(sample, col))) - - # Since we are now jamming multiple (merged) - # entries into a single attribute, we have to - # join them into a space-delimited string - # and then set to sample attribute. - for key, val in row_dict.items(): - if key == SAMPLE_NAME_COLNAME or not val: - continue - _LOGGER.debug("merge: sample '%s'; %s=%s", - str(sample.name), str(key), str(val)) - if not key in merged_cols: - new_val = str(val).rstrip() - else: - new_val = "{} {}".format( - merged_cols[key], str(val)).strip() - merged_cols[key] = new_val # 2) - - # Don't update sample_name. - merged_cols.pop(SAMPLE_NAME_COLNAME, None) - - sample.update(merged_cols) # 3) - sample.merged = True # mark sample as merged - sample.merged_cols = merged_cols - - return sample - - - @copy class Project(AttributeDict): """ @@ -1296,28 +1308,6 @@ def _handle_missing_env_attrs(self, env_settings_file, when_missing): -def check_sheet(sample_file, dtype=str): - """ - Check if csv file exists and has all required columns. - - :param str sample_file: path to sample annotations file. - :param type dtype: data type for CSV read. - :raises IOError: if given annotations file can't be read. - :raises ValueError: if required column(s) is/are missing. - """ - - df = _pd.read_table(sample_file, sep=None, dtype=dtype, - index_col=False, engine="python") - req = [SAMPLE_NAME_COLNAME] - missing = set(req) - set(df.columns) - if len(missing) != 0: - raise ValueError( - "Annotation sheet ('{}') is missing column(s): {}; has: {}". - format(sample_file, missing, df.columns)) - return df - - - @copy class Sample(object): """ @@ -1713,10 +1703,21 @@ def set_file_paths(self, project): def set_genome(self, genomes): + """ + Set the genome for this Sample. 
+ + :param Mapping[str, str] genomes: genome assembly by organism name + """ self._set_assembly("genome", genomes) def set_transcriptome(self, transcriptomes): + """ + Set the transcriptome for this Sample. + + :param Mapping[str, str] transcriptomes: trascriptome assembly by + organism name + """ self._set_assembly("transcriptome", transcriptomes) @@ -1953,7 +1954,6 @@ def obj2dict(obj, _LOGGER.debug("Generating YAML data for %s: '%s'", self.__class__.__name__, self.name) yaml_data = yaml.safe_dump(serial, default_flow_style=False) - #yaml_data = yaml.dump(serial, Dumper=PepYamlDumper, default_flow_style=False) outfile.write(yaml_data) @@ -1965,113 +1965,6 @@ def update(self, newdata): setattr(self, key, value) - @classmethod - def select_sample_subtype(cls, pipeline_filepath, protocol=None): - """ - From a pipeline module, select Sample subtype for a particular protocol. - - The indicated file needs to be a Python module that can be imported. - Critically, it must be written such that importing it does not run it - as a script. That is, its workflow logic should be bundled into - function(s), or at least nested under a "if __name__ == '__main__'" - conditional. - - :param str pipeline_filepath: path to file defining a pipeline - :param str protocol: name of protocol for which to select Sample subtype - :return type: Sample type most tailored to indicated protocol and - defined within the module indicated by the given filepath, - optional; if unspecified, or if the indicated file cannot be - imported, then the base Sample type is returned. - """ - - if not _os.path.isfile(pipeline_filepath): - _LOGGER.debug("Alleged pipeline module path is not a file: '%s'", - pipeline_filepath) - return cls - - # Determine whether it appears safe to import the pipeline module, - # and return a generic, base Sample if not. - import subprocess - def file_has_pattern(pattern, filepath): - try: - with open(_os.devnull, 'w') as devnull: - return subprocess.call( - ["grep", pattern, filepath], stdout=devnull) - except Exception: - return False - safety_lines = ["if __name__ == '__main__'", - "if __name__ == \"__main__\""] - safe_to_import = \ - any(map(partial(file_has_pattern, - filepath=pipeline_filepath), - safety_lines)) - if not safe_to_import: - _LOGGER.debug("Attempt to import '{}' may run code so is refused.". - format(pipeline_filepath)) - return cls - - # Import pipeline module and find Sample subtypes. - _, modname = _os.path.split(pipeline_filepath) - modname, _ = _os.path.splitext(modname) - try: - _LOGGER.debug("Attempting to import module defined by {}, " - "calling it {}".format(pipeline_filepath, modname)) - pipeline_module = import_from_source( - name=modname, module_filepath=pipeline_filepath) - except ImportError as e: - _LOGGER.warn("Using base Sample because of failure in attempt to " - "import pipeline module: {}".format(e)) - return cls - else: - _LOGGER.debug("Successfully imported pipeline module '%s', " - "naming it '%s'", pipeline_filepath, - pipeline_module.__name__) - - import inspect - sample_subtypes = inspect.getmembers( - pipeline_module, lambda obj: isinstance(obj, Sample)) - _LOGGER.debug("%d sample subtype(s): %s", len(sample_subtypes), - ", ".join([subtype.__name__ - for subtype in sample_subtypes])) - - # Match all subtypes for null protocol; use __library__ for non-null. 
- if protocol is None: - _LOGGER.debug("Null protocol, matching every subtypes...") - matched_subtypes = sample_subtypes - else: - protocol_key = alpha_cased(protocol) - matched_subtypes = \ - [subtype for subtype in sample_subtypes - if protocol_key == alpha_cased(subtype.__library__)] - - # Helpful for messages about protocol name for each subtype - subtype_by_protocol_text = \ - ", ".join(["'{}' ({})".format(subtype.__library, subtype) - for subtype in sample_subtypes]) - - # Select subtype based on match count. - if 0 == len(matched_subtypes): - # Fall back to base Sample if we have no matches. - _LOGGER.debug( - "No known Sample subtype for protocol '{}' in '{}'; " - "known: {}".format(protocol, pipeline_filepath, - subtype_by_protocol_text)) - return cls - elif 1 == len(matched_subtypes): - # Use the single match if there's exactly one. - subtype = matched_subtypes[0] - _LOGGER.info("Matched protocol '{}' to Sample subtype {}". - format(protocol, subtype.__name__)) - return subtype - else: - # Throw up our hands and fall back to base Sample for multi-match. - _LOGGER.debug("Unable to choose from {} Sample subtype matches " - "for protocol '{}' in '{}': {}".format( - len(matched_subtypes), protocol, - pipeline_filepath, subtype_by_protocol_text)) - return cls - - @copy class PipelineInterface(object): @@ -2123,11 +2016,22 @@ def __repr__(self): @property def pipeline_names(self): + """ + Names of pipelines about which this interface is aware. + + :return Iterable[str]: names of pipelines about which this + interface is aware + """ return self.pipe_iface_config.keys() @property def pipelines(self): + """ + Keyed collection of pipeline interface data. + + :return Mapping: pipeline interface configuration data + """ return self.pipe_iface_config.values() @@ -2500,58 +2404,6 @@ def pipeline_key_to_path(self, pipeline_key): -def _import_sample_subtype(pipeline_filepath, subtype_name): - """ - Import a particular Sample subclass from a Python module. - - :param str pipeline_filepath: path to file to regard as Python module - :param str subtype_name: name of the target class; this must derive from - the base Sample class. 
- :return type: the imported class, defaulting to base Sample in case of - failure with the import or other logic - :raises _UndefinedSampleSubtypeException: if the module is imported but - type indicated by subtype_name is not found as a class - """ - base_type = Sample - - _, modname = _os.path.split(pipeline_filepath) - modname, _ = _os.path.splitext(modname) - - try: - _LOGGER.debug("Attempting to import module defined by {}, " - "calling it {}".format(pipeline_filepath, modname)) - pipeline_module = import_from_source( - name=modname, module_filepath=pipeline_filepath) - except ImportError as e: - _LOGGER.warn("Using base %s because of failure in attempt to " - "import pipeline module: %s", base_type.__name__, e) - return base_type - else: - _LOGGER.debug("Successfully imported pipeline module '%s', " - "naming it '%s'", pipeline_filepath, - pipeline_module.__name__) - - import inspect - def class_names(cs): - return ", ".join([c.__name__ for c in cs]) - - classes = inspect.getmembers( - pipeline_module, lambda obj: inspect.isclass(obj)) - _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) - sample_subtypes = filter(lambda c: issubclass(c, base_type), classes) - _LOGGER.debug("%d %s subtype(s): %s", len(sample_subtypes), - base_type.__name__, class_names(sample_subtypes)) - - for st in sample_subtypes: - if st.__name__ == subtype_name: - _LOGGER.debug("Successfully imported %s from '%s'", - subtype_name, pipeline_filepath) - return st - raise _UndefinedSampleSubtypeException( - subtype_name=subtype_name, pipeline_filepath=pipeline_filepath) - - - @copy class ProtocolMapper(Mapping): """ @@ -2620,6 +2472,7 @@ def build_pipeline(self, protocol): self.parse_parallel_jobs(split_jobs[i], split_jobs[i - 1]) """ + def parse_parallel_jobs(self, job, dep): job = job.replace("(", "").replace(")", "") split_jobs = [x.strip() for x in job.split(',')] @@ -2629,6 +2482,7 @@ def parse_parallel_jobs(self, job, dep): else: self.register_job(job, dep) + def register_job(self, job, dep): _LOGGER.info("Register Job Name: %s\tDep: %s", str(job), str(dep)) @@ -2690,5 +2544,58 @@ def __init__(self, subtype_name, pipeline_filepath): super(_UndefinedSampleSubtypeException, self).__init__(reason) + +def _import_sample_subtype(pipeline_filepath, subtype_name): + """ + Import a particular Sample subclass from a Python module. + + :param str pipeline_filepath: path to file to regard as Python module + :param str subtype_name: name of the target class; this must derive from + the base Sample class. 
+ :return type: the imported class, defaulting to base Sample in case of + failure with the import or other logic + :raises _UndefinedSampleSubtypeException: if the module is imported but + type indicated by subtype_name is not found as a class + """ + base_type = Sample + + _, modname = _os.path.split(pipeline_filepath) + modname, _ = _os.path.splitext(modname) + + try: + _LOGGER.debug("Attempting to import module defined by {}, " + "calling it {}".format(pipeline_filepath, modname)) + pipeline_module = import_from_source( + name=modname, module_filepath=pipeline_filepath) + except ImportError as e: + _LOGGER.warn("Using base %s because of failure in attempt to " + "import pipeline module: %s", base_type.__name__, e) + return base_type + else: + _LOGGER.debug("Successfully imported pipeline module '%s', " + "naming it '%s'", pipeline_filepath, + pipeline_module.__name__) + + import inspect + def class_names(cs): + return ", ".join([c.__name__ for c in cs]) + + classes = inspect.getmembers( + pipeline_module, lambda obj: inspect.isclass(obj)) + _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) + sample_subtypes = filter(lambda c: issubclass(c, base_type), classes) + _LOGGER.debug("%d %s subtype(s): %s", len(sample_subtypes), + base_type.__name__, class_names(sample_subtypes)) + + for st in sample_subtypes: + if st.__name__ == subtype_name: + _LOGGER.debug("Successfully imported %s from '%s'", + subtype_name, pipeline_filepath) + return st + raise _UndefinedSampleSubtypeException( + subtype_name=subtype_name, pipeline_filepath=pipeline_filepath) + + + def _is_member(item, items): return item in items From b5d7fabb6a0f0cd146c8a11ff947424170c78b3a Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 17:18:48 -0400 Subject: [PATCH 47/94] pin down what needs to change in the Sample independent tests around column inference --- looper/models.py | 2 +- tests/models/independent/test_Sample.py | 17 ++++++++--------- tests/models/independent/test_SampleSheet.py | 16 ---------------- .../test_Project_Sample_interaction.py | 2 +- 4 files changed, 10 insertions(+), 27 deletions(-) delete mode 100644 tests/models/independent/test_SampleSheet.py diff --git a/looper/models.py b/looper/models.py index 7c1b72b8..f30fddb0 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1715,7 +1715,7 @@ def set_transcriptome(self, transcriptomes): """ Set the transcriptome for this Sample. - :param Mapping[str, str] transcriptomes: trascriptome assembly by + :param Mapping[str, str] transcriptomes: transcriptome assembly by organism name """ self._set_assembly("transcriptome", transcriptomes) diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py index 25602a88..8f8bf3b5 100644 --- a/tests/models/independent/test_Sample.py +++ b/tests/models/independent/test_Sample.py @@ -38,12 +38,15 @@ class ParseSampleImplicationsTests: IMPLICATIONS = [SAMPLE_A_IMPLICATIONS, SAMPLE_B_IMPLICATIONS] IMPLICATIONS_MAP = {IMPLIER_NAME: IMPLICATIONS} + # TODO: now what's passed to the function is a Project instance. + # TODO: it's still the Sample itself that's responsible for USING the + # TODO: project instance passed in order to do column inference. + def test_project_lacks_implications(self, sample): """ With no implications mapping, sample is unmodified. 
""" before_inference = sample.__dict__ - with mock.patch.object(sample, "prj", create=True): - sample.infer_columns() + sample.infer_columns(None) after_inference = sample.__dict__ assert before_inference == after_inference @@ -51,18 +54,14 @@ def test_project_lacks_implications(self, sample): def test_empty_implications(self, sample): """ Empty implications mapping --> unmodified sample. """ before_inference = sample.__dict__ - implications = mock.MagicMock(implied_columns={}) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns({}) assert before_inference == sample.__dict__ def test_null_intersection_between_sample_and_implications(self, sample): """ Sample with none of implications' fields --> no change. """ before_inference = sample.__dict__ - implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns(self.IMPLICATIONS_MAP) assert before_inference == sample.__dict__ @@ -86,7 +85,7 @@ def test_intersection_between_sample_and_implications( # Perform column inference based on mocked implications. implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns(self.IMPLICATIONS_MAP) # Validate updates to sample based on column implications & inference. for implied_name, implied_value in implications.items(): diff --git a/tests/models/independent/test_SampleSheet.py b/tests/models/independent/test_SampleSheet.py deleted file mode 100644 index 27d56e78..00000000 --- a/tests/models/independent/test_SampleSheet.py +++ /dev/null @@ -1,16 +0,0 @@ -""" Tests for the SampleSheet model. """ - -import pandas as pd -import pytest -from looper.models import SampleSheet - - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - - -# TODO: implement a few of these. -@pytest.mark.skip("Not implemented") -class SampleSheetRoundtripTests: - pass diff --git a/tests/models/integration/test_Project_Sample_interaction.py b/tests/models/integration/test_Project_Sample_interaction.py index 0067ff82..40ebf7f3 100644 --- a/tests/models/integration/test_Project_Sample_interaction.py +++ b/tests/models/integration/test_Project_Sample_interaction.py @@ -24,7 +24,7 @@ -class ProjectSampleInteractionTests: +class SampleFolderCreationTests: """ Tests for interaction between Project and Sample. 
""" CONFIG_DATA_PATHS_HOOK = "uses_paths_section" From 4483f4ea4de027865f8b70ba961dfcc9c44e6b7b Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 17:41:29 -0400 Subject: [PATCH 48/94] fix the Sample tests based on column inference method signature change --- tests/models/independent/test_Sample.py | 43 ++++++------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py index 8f8bf3b5..301f2c14 100644 --- a/tests/models/independent/test_Sample.py +++ b/tests/models/independent/test_Sample.py @@ -35,29 +35,19 @@ class ParseSampleImplicationsTests: IMPLIER_VALUES = ["a", "b"] SAMPLE_A_IMPLICATIONS = {"genome": "hg38", "phenome": "hg72"} SAMPLE_B_IMPLICATIONS = {"genome": "hg38"} - IMPLICATIONS = [SAMPLE_A_IMPLICATIONS, SAMPLE_B_IMPLICATIONS] + IMPLICATIONS = {"a": SAMPLE_A_IMPLICATIONS, "b": SAMPLE_B_IMPLICATIONS} IMPLICATIONS_MAP = {IMPLIER_NAME: IMPLICATIONS} - # TODO: now what's passed to the function is a Project instance. - # TODO: it's still the Sample itself that's responsible for USING the - # TODO: project instance passed in order to do column inference. - - def test_project_lacks_implications(self, sample): + @pytest.mark.parametrize(argnames="implications", argvalues=[None, {}, []]) + def test_project_no_implications(self, sample, implications): """ With no implications mapping, sample is unmodified. """ before_inference = sample.__dict__ - sample.infer_columns(None) + sample.infer_columns(implications) after_inference = sample.__dict__ assert before_inference == after_inference - def test_empty_implications(self, sample): - """ Empty implications mapping --> unmodified sample. """ - before_inference = sample.__dict__ - sample.infer_columns({}) - assert before_inference == sample.__dict__ - - def test_null_intersection_between_sample_and_implications(self, sample): """ Sample with none of implications' fields --> no change. """ before_inference = sample.__dict__ @@ -67,7 +57,7 @@ def test_null_intersection_between_sample_and_implications(self, sample): @pytest.mark.parametrize( argnames=["implier_value", "implications"], - argvalues=zip(IMPLIER_VALUES, IMPLICATIONS), + argvalues=IMPLICATIONS.items(), ids=lambda implier_and_implications: "implier='{}', implications={}".format( implier_and_implications[0], str(implier_and_implications[1]))) @@ -81,11 +71,7 @@ def test_intersection_between_sample_and_implications( # Set the parameterized value for the implications source field. setattr(sample, self.IMPLIER_NAME, implier_value) - - # Perform column inference based on mocked implications. - implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns(self.IMPLICATIONS_MAP) + sample.infer_columns(self.IMPLICATIONS_MAP) # Validate updates to sample based on column implications & inference. 
for implied_name, implied_value in implications.items(): @@ -95,29 +81,18 @@ def test_intersection_between_sample_and_implications( @pytest.mark.parametrize( argnames="unmapped_implier_value", argvalues=["totally-wacky-value", 62, None, np.nan]) - @pytest.mark.parametrize( - argnames="implications", argvalues=IMPLICATIONS, - ids=lambda implications: "implied={}".format(str(implications))) def test_sample_has_unmapped_value_for_implication( - self, sample, unmapped_implier_value, implications): + self, sample, unmapped_implier_value): """ Unknown value in implier field --> null inference. """ - # Negative control pre-/post-test. def no_implied_values(): assert all([not hasattr(sample, implied_field_name) - for implied_field_name in implications.keys()]) - + for implied_field_name in self.IMPLICATIONS.keys()]) no_implied_values() - - # Set the parameterized value for the implications source field. setattr(sample, self.IMPLIER_NAME, unmapped_implier_value) - - # Perform column inference based on mocked implications. - implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns(self.IMPLICATIONS_MAP) no_implied_values() From a1711170ae0f2e32781de9b0dc39707c1715494a Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 20 Jun 2017 17:49:46 -0400 Subject: [PATCH 49/94] update pipeline locations key in tests Project config data; update the patch where the check for the presence of the sample annotations sheet is done --- tests/conftest.py | 2 +- tests/models/independent/test_Project.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 51ee20f7..b05c4bdf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,7 +30,7 @@ PROJECT_CONFIG_LINES = """metadata: sample_annotation: samples.csv output_dir: test - pipelines_dir: pipelines + pipeline_interfaces: pipelines merge_table: merge.csv derived_columns: [{derived_column_names}] diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index e28c992c..b232ce96 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -23,7 +23,7 @@ def project_config_data(): "metadata": { SAMPLE_ANNOTATIONS_KEY: "sample-anns-filler.csv", "output_dir": "$HOME/sequencing/output", - "pipelines_dir": "${CODE}/pipelines"}, + "pipeline_interfaces": "${CODE}/pipelines"}, "data_sources": {"arbitrary": "placeholder/data/{filename}"}, "genomes": {"human": "hg19", "mouse": "mm10"}, "transcriptomes": {"human": "hg19_cdna", "mouse": "mm10_cdna"}} @@ -489,7 +489,7 @@ def observed_argstring_elements( conf_file_path = _write_project_config(confdata, dirpath=confpath) # Subvert requirement for sample annotations file. 
- with mock.patch("looper.models.Project.add_sample_sheet"): + with mock.patch("looper.models.check_sheet"): project = Project(conf_file_path, default_compute=envpath) argstring = project.get_arg_string(pipeline) From 4d0f331231aaeef48f600f2404cfe06eaf9a90d7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 01:06:53 -0400 Subject: [PATCH 50/94] fix handling of initial Sample creation for Project; update Project tests --- looper/models.py | 112 ++++++++++++----------- tests/models/independent/test_Project.py | 4 +- 2 files changed, 62 insertions(+), 54 deletions(-) diff --git a/looper/models.py b/looper/models.py index f30fddb0..0cbb655c 100644 --- a/looper/models.py +++ b/looper/models.py @@ -624,7 +624,8 @@ def __init__(self, config_file, subproject=None, process_pipeline_interfaces(self.metadata.pipelines_dir) self.sheet = check_sheet(self.metadata.sample_annotation) self.merge_table = None - self._samples = None if defer_sample_construction else self.samples + self._samples = None if defer_sample_construction \ + else self._make_basic_samples() def __repr__(self): @@ -732,59 +733,12 @@ def samples(self): :return Iterable[Sample]: Sample instance for each of this Project's samples """ - if hasattr(self, "_samples") and self._samples is not None: - _LOGGER.debug("%s has %d basic Sample(s)", - self.__class__.__name__, len(self._samples)) - return self._samples - else: + if self._samples is None: _LOGGER.debug("Building basic Sample(s) for %s", self.__class__.__name__) - - # This should be executed just once, establishing the Project's - # base Sample objects if they don't already exist. - if hasattr(self.metadata, "merge_table"): - if self.merge_table is None: - if _os.path.isfile(self.metadata.merge_table): - self.merge_table = _pd.read_table( - self.metadata.merge_table, - sep=None, engine="python") - else: - _LOGGER.debug( - "Alleged path to merge table data is not a " - "file: '%s'", self.metadata.merge_table) - else: - _LOGGER.debug("Already parsed merge table") - else: - _LOGGER.debug("No merge table") - - # Define merge behavior based on presence of merge table. - if self.merge_table is None: - def merge(s): - return s - else: - def merge(s): - return merge_sample(s, self.merge_table, self.data_sources, - self.derived_columns) - - # Create the Sample(s). - samples = [] - for _, row in self.sheet.iterrows(): - sample = Sample(row.dropna()) - sample.set_genome(self.genomes) - sample.set_transcriptome(self.transcriptomes) - - sample.set_file_paths(self) - # Hack for backwards-compatibility - # Pipelines should now use `data_source`) - try: - sample.data_path = sample.data_source - except AttributeError: - _LOGGER.debug("Sample '%s' lacks data source; skipping " - "data path assignment", sample.sample_name) - sample = merge(sample) - samples.append(sample) - - self._samples = samples + self._samples = self._make_basic_samples() + _LOGGER.debug("%s has %d basic Sample(s)", + self.__class__.__name__, len(self._samples)) return self._samples @@ -1043,6 +997,56 @@ def make_project_dirs(self): str(e)) + def _make_basic_samples(self): + """ Build the base Sample objects from the annotations sheet data. """ + + # This should be executed just once, establishing the Project's + # base Sample objects if they don't already exist. 
+ if hasattr(self.metadata, "merge_table"): + if self.merge_table is None: + if _os.path.isfile(self.metadata.merge_table): + self.merge_table = _pd.read_table( + self.metadata.merge_table, + sep=None, engine="python") + else: + _LOGGER.debug( + "Alleged path to merge table data is not a " + "file: '%s'", self.metadata.merge_table) + else: + _LOGGER.debug("Already parsed merge table") + else: + _LOGGER.debug("No merge table") + + # Define merge behavior based on presence of merge table. + if self.merge_table is None: + def merge(s): + return s + else: + def merge(s): + return merge_sample(s, self.merge_table, self.data_sources, + self.derived_columns) + + # Create the Sample(s). + samples = [] + for _, row in self.sheet.iterrows(): + sample = Sample(row.dropna()) + sample.set_genome(self.get("genomes")) + sample.set_transcriptome(self.get("transcriptomes")) + + sample.set_file_paths(self) + # Hack for backwards-compatibility + # Pipelines should now use `data_source`) + try: + sample.data_path = sample.data_source + except AttributeError: + _LOGGER.debug("Sample '%s' lacks data source; skipping " + "data path assignment", sample.sample_name) + sample = merge(sample) + samples.append(sample) + + return samples + + def parse_config_file(self, subproject=None): """ Parse provided yaml config file and check required fields exist. @@ -1722,6 +1726,10 @@ def set_transcriptome(self, transcriptomes): def _set_assembly(self, ome, assemblies): + if not assemblies: + _LOGGER.debug("Empty/null assemblies mapping: {} ({})". + format(assemblies, type(assemblies))) + return try: assembly = assemblies[self.organism] except AttributeError: diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index b232ce96..293f7086 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -220,7 +220,7 @@ def _assert_null_compute_environment(project): @staticmethod def default_compute_settings(project): - settings_filepath = project.default_cmpenv_file + settings_filepath = project.default_compute_envfile with open(settings_filepath, 'r') as settings_data_file: settings = yaml.safe_load(settings_data_file) return {"environment": copy.deepcopy(settings), @@ -276,7 +276,7 @@ def create_project( # Write the config and build the Project. conf_file_path = _write_project_config( project_config_data, dirpath=dirpath) - with mock.patch("looper.models.Project.add_sample_sheet"): + with mock.patch("looper.models.check_sheet"): project = Project(conf_file_path, default_compute=default_env_path) return expected_derived_columns, project From 341230af9f3540d2a1f053fc36fd5475999f2e31 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 01:14:35 -0400 Subject: [PATCH 51/94] account for new attribute fetch call and remove outdated test --- looper/models.py | 3 ++- .../models/independent/test_AttributeDict.py | 24 ------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/looper/models.py b/looper/models.py index 0cbb655c..b384a432 100644 --- a/looper/models.py +++ b/looper/models.py @@ -370,7 +370,8 @@ def __getattr__(self, item, default=None): """ try: return super(AttributeDict, self).__getattribute__(item) - except AttributeError: + except (AttributeError, TypeError): + # Handle potential property and non-string failures. 
pass try: # Fundamentally, this is still a mapping; diff --git a/tests/models/independent/test_AttributeDict.py b/tests/models/independent/test_AttributeDict.py index 6d63430e..f1f7f5e0 100644 --- a/tests/models/independent/test_AttributeDict.py +++ b/tests/models/independent/test_AttributeDict.py @@ -562,30 +562,6 @@ def test_all_defaults_no_metadata(self, tmpdir, proj, metadata_attribute): lines, _ = self._yaml_data(sample, filepath) assert all([metadata_attribute not in line for line in lines]) - - @pytest.mark.parametrize( - argnames="metadata_attribute", argvalues=ATTRDICT_METADATA.keys(), - ids=lambda attr_name: " metadata item = {} ".format(attr_name)) - def test_non_defaults_have_metadata( - self, tmpdir, proj, metadata_attribute): - """ Only non-default metadata elements are written to file. """ - for i, sample in enumerate(proj.samples): - filepath = os.path.join(tmpdir.strpath, "sample{}.yaml".format(i)) - - # Flip the value of an attribute in the project section. - newval = not ATTRDICT_METADATA[metadata_attribute] - lines, data = self._yaml_data( - sample, filepath, section_to_change="prj", - attr_to_change=metadata_attribute, newval=newval) - - # Is the test sensitive? - assert newval == data["prj"][metadata_attribute] - # How about specific? - num_meta_lines = sum(1 if any( - [meta_item in line for meta_item - in ATTRDICT_METADATA.keys()]) else 0 for line in lines) - assert 1 == num_meta_lines - @staticmethod def _yaml_data(sample, filepath, section_to_change=None, From a35868e465bda234605438027906fa5d51339ffe Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 01:26:22 -0400 Subject: [PATCH 52/94] ensure Sample has merged flag; remove old tests and fix others --- looper/models.py | 4 ++ tests/test_looper.py | 116 +------------------------------------------ 2 files changed, 6 insertions(+), 114 deletions(-) diff --git a/looper/models.py b/looper/models.py index b384a432..a24b7df1 100644 --- a/looper/models.py +++ b/looper/models.py @@ -235,6 +235,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): sample.update(merged_cols) # 3) sample.merged = True # mark sample as merged sample.merged_cols = merged_cols + sample.merged = True return sample @@ -1372,6 +1373,9 @@ def __init__(self, series): self.required_paths = None self.yaml_file = None + # Not yet merged, potentially toggled when merge step is considered. + self.merged = False + # Sample dirs # Only when sample is added to project, can paths be added - # This is because sample-specific files will be created in a diff --git a/tests/test_looper.py b/tests/test_looper.py index 14b8b331..bcf63671 100644 --- a/tests/test_looper.py +++ b/tests/test_looper.py @@ -94,29 +94,13 @@ def test_unmerged_samples_lack_merged_cols(self, proj, sample_index): assert not proj.samples[sample_index].merged_cols - @pytest.mark.parametrize(argnames="sample_index", - argvalues=range(NUM_SAMPLES)) - def test_multiple_add_sample_sheet_calls_no_rederivation(self, proj, - sample_index): - """ Don't rederive `derived_columns` for multiple calls. 
""" - expected_files = FILE_BY_SAMPLE[sample_index] - def _observed(p): - return [os.path.basename(f) - for f in p.samples[sample_index].file.split(" ")] - assert expected_files == _observed(proj) - proj.add_sample_sheet() - proj.add_sample_sheet() - assert expected_files == _observed(proj) - proj.add_sample_sheet() - assert expected_files == _observed(proj) - - def test_duplicate_derived_columns_still_derived(self, proj): sample_index = 2 observed_nonmerged_col_basename = \ os.path.basename(proj.samples[sample_index].nonmerged_col) assert "c.txt" == observed_nonmerged_col_basename - assert "" == proj.samples[sample_index].locate_data_source('file') + assert "" == proj.samples[sample_index].locate_data_source( + proj.data_sources, 'file') @@ -223,102 +207,6 @@ def test_looper_args_usage(self, pipe_iface, pipeline, expected): -@pytest.mark.usefixtures("write_project_files") -class SampleRoundtripTests: - """ Test equality of objects written to and from YAML files. """ - - - def test_default_behavioral_metadata_retention(self, tmpdir, proj): - """ With default metadata, writing to file and restoring is OK. """ - tempfolder = str(tmpdir) - sample_tempfiles = [] - for sample in proj.samples: - path_sample_tempfile = os.path.join(tempfolder, - "{}.yaml".format(sample.name)) - sample.to_yaml(path_sample_tempfile) - sample_tempfiles.append(path_sample_tempfile) - for original_sample, temp_sample_path in zip(proj.samples, - sample_tempfiles): - with open(temp_sample_path, 'r') as sample_file: - restored_sample_data = yaml.load(sample_file) - ad = AttributeDict(restored_sample_data) - self._metadata_equality(original_sample.prj, ad) - - - def test_modified_behavioral_metadata_preservation(self, tmpdir, proj): - """ Behavior metadata modifications are preserved to/from disk. """ - tempfolder = str(tmpdir) - sample_tempfiles = [] - samples = proj.samples - assert 1 < len(samples), "Too few samples: {}".format(len(samples)) - - # TODO: note that this may fail if metadata - # modification prohibition is implemented. - samples[0].prj.__dict__["_force_nulls"] = True - samples[1].prj.__dict__["_attribute_identity"] = True - - for sample in proj.samples[:2]: - path_sample_tempfile = os.path.join(tempfolder, - "{}.yaml".format(sample.name)) - sample.to_yaml(path_sample_tempfile) - sample_tempfiles.append(path_sample_tempfile) - - with open(sample_tempfiles[0], 'r') as f: - sample_0_data = yaml.load(f) - assert AttributeDict(sample_0_data).prj._force_nulls is True - - with open(sample_tempfiles[1], 'r') as f: - sample_1_data = yaml.load(f) - sample_1_restored_attrdict = AttributeDict(sample_1_data) - assert sample_1_restored_attrdict.prj.does_not_exist == "does_not_exist" - - - def _check_nested_metadata(self, original, restored): - """ - Check equality for metadata items, accounting for nesting within - instances of AttributeDict and its child classes. 
- - :param AttributeDict original: original AttributeDict (or child) object - :param AttributeDict restored: instance restored from writing - original object to file, then reparsing and constructing - AttributeDict instance - :return bool: whether metadata items are equivalent between objects - at all nesting levels - """ - for key, data in original.items(): - if key not in restored: - return False - equal_level = self._metadata_equality(original, restored) - if not equal_level: - return False - if isinstance(original, AttributeDict): - return isinstance(restored, AttributeDict) and \ - self._check_nested_metadata(data, restored[key]) - else: - return True - - - @staticmethod - def _metadata_equality(original, restored): - """ - Check nested levels of metadata equality. - - :param AttributeDict original: a raw AttributeDict or an - instance of a child class that was serialized and written to disk - :param AttributeDict restored: an AttributeDict instance created by - parsing the file associated with the original object - :return bool: whether all metadata keys/items have equal value - when comparing original object to restored version - """ - for metadata_item in ATTRDICT_METADATA: - if metadata_item not in original or \ - metadata_item not in restored or \ - original[metadata_item] != restored[metadata_item]: - return False - return True - - - class RunErrorReportTests: """ Tests for aggregation of submission failures. """ From d76c70b30ed65a9e13fd4cff8535d6f7da513a47 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 09:54:41 -0400 Subject: [PATCH 53/94] use Series as raw Sample data in case subtypes are restrictive; grab actual class from inspect module call; control protocol naming --- looper/looper.py | 2 +- looper/models.py | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index f982c447..def7321d 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -241,7 +241,7 @@ def run(prj, args, remaining_args): # TODO: determine what to do with subtype(s) here. # Processing preconditions have been met. processed_samples.add(sample.sample_name) - sample_data = sample.as_series().to_dict() + sample_data = sample.as_series() # Go through all pipelines to submit for this protocol. 
# Note: control flow doesn't reach this point if variable "pipelines" diff --git a/looper/models.py b/looper/models.py index a24b7df1..e8c35b58 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2340,9 +2340,10 @@ def create_submission_bundle(self, pipeline_key, protocol): try: subtypes = this_pipeline_data[self.SUBTYPE_MAPPING_SECTION] except KeyError: - _LOGGER.debug("%s from '%s' doesn't define section '%s'", - self.pipe_iface.__class__.__name__, - self.location, self.SUBTYPE_MAPPING_SECTION) + _LOGGER.debug("%s from '%s' doesn't define section '%s' " + "for pipeline '%s'", + self.pipe_iface.__class__.__name__, self.location, + self.SUBTYPE_MAPPING_SECTION, strict_pipe_key) subtype = Sample else: if isinstance(subtypes, str): @@ -2352,12 +2353,16 @@ def create_submission_bundle(self, pipeline_key, protocol): strict_pipe_key, self.location) else: try: - subtype_name = subtypes[protocol] + temp_subtypes = {alpha_cased(p): st + for p, st in subtypes.items()} + subtype_name = temp_subtypes[alpha_cased(protocol)] except KeyError: subtype = Sample - _LOGGER.debug("No %s subtype specified for pipeline '%s' " - "in interface from '%s'", subtype.__name__, - strict_pipe_key, self.location) + _LOGGER.debug("No %s subtype specified in interface from " + "'%s': '%s', '%s'; known: %s", + subtype.__name__, self.location, + strict_pipe_key, protocol, + ", ".join(temp_subtypes.keys())) # subtype_name is defined if and only if subtype remained null. subtype = subtype or \ @@ -2595,6 +2600,7 @@ def class_names(cs): classes = inspect.getmembers( pipeline_module, lambda obj: inspect.isclass(obj)) + classes = [klazz for _, klazz in classes] _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) sample_subtypes = filter(lambda c: issubclass(c, base_type), classes) _LOGGER.debug("%d %s subtype(s): %s", len(sample_subtypes), From 9f97b2a769ab9bb3862a6ea5a229a975d442e07d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 13:42:38 -0400 Subject: [PATCH 54/94] improve the submission counting --- looper/looper.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index def7321d..1af1ac9a 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -186,7 +186,6 @@ def run(prj, args, remaining_args): valid_read_types = ["single", "paired"] # Keep track of how many jobs have been submitted. - num_job_templates = 0 # Each sample can have multiple jobs. job_count = 0 # Some job templates will be skipped. submit_count = 0 # Some jobs won't be submitted. processed_samples = set() @@ -250,7 +249,7 @@ def run(prj, args, remaining_args): # script name, it's more flexible. for pipeline_interface, sample_subtype, pipeline_key, pipeline_job \ in submission_bundles: - num_job_templates += 1 + job_count += 1 _LOGGER.debug("Creating %s instance: '%s'", sample_subtype.__name__, sample.sample_name) @@ -381,7 +380,6 @@ def run(prj, args, remaining_args): submit_settings["CODE"] = cmd # Submit job! - job_count += 1 _LOGGER.debug("Attempting job submission: '%s' ('%s')", sample.sample_name, pipeline_name) submitted = cluster_submit( @@ -392,21 +390,18 @@ def run(prj, args, remaining_args): dry_run=args.dry_run, ignore_flags=args.ignore_flags, remaining_args=remaining_args) if submitted: - _LOGGER.debug("SUCCESS: submitted") + _LOGGER.debug("SUBMITTED") submit_count += 1 else: - _LOGGER.debug("FAILURE: not submitted") + _LOGGER.debug("NOT SUBMITTED") - msg = "Looper finished. 
{} of {} sample(s) generated job template(s); " \ - "{} of {} job template(s) were considered for submission, and " \ - "{} of those were actually submitted.".format( - len(processed_samples), num_samples, - job_count, num_job_templates, submit_count) + # Report what went down. + _LOGGER.info("Looper finished") + _LOGGER.info("Samples generating jobs: %d of %d", + len(processed_samples), num_samples) + _LOGGER.info("Jobs submitted: %d of %d", submit_count, job_count) if args.dry_run: - msg += " Dry run. No jobs were actually submitted." - - _LOGGER.info(msg) - + _LOGGER.info("Dry run. No jobs were actually submitted.") if failures: _LOGGER.info("%d sample(s) with submission failure.", len(failures)) sample_by_reason = aggregate_exec_skip_reasons(failures) From d2bd8071ece355b110245fc6245843c8b6df0f6f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 14:56:11 -0400 Subject: [PATCH 55/94] accumulate missing attributes and files in the required inputs confirmation; name changes; making this more explicit --- looper/looper.py | 20 +++++---- looper/models.py | 112 ++++++++++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 1af1ac9a..96799951 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -278,15 +278,17 @@ def run(prj, args, remaining_args): _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) - try: - # Check for any required inputs before submitting. - _LOGGER.debug("Confirming required inputs") - sample.confirm_required_inputs() - except IOError: - # TODO: inform about WHICH missing file(s). - fail_message = "Required input file(s) not found" - _LOGGER.warn("> Not submitted: %s", fail_message) - skip_reasons.append(fail_message) + # Check for any missing requirements before submitting. + _LOGGER.debug("Determining missing requirements") + error_type, missing_reqs_msg = \ + sample.determine_missing_requirements() + if missing_reqs_msg: + if prj.permissive: + _LOGGER.warn(missing_reqs_msg) + else: + raise error_type(missing_reqs_msg) + _LOGGER.warn("> Not submitted: %s", missing_reqs_msg) + skip_reasons.append(missing_reqs_msg) # Check if single_or_paired value is recognized. if hasattr(sample, "read_type"): diff --git a/looper/models.py b/looper/models.py index e8c35b58..82eb1021 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1423,37 +1423,45 @@ def check_valid(self, required=None): return lacking - def confirm_required_inputs(self, permissive=False): + def determine_missing_requirements(self): + """ + Determine which of this Sample's required attributes/files are missing. + + :return (type, str): hypothetical exception type along with message + about what's missing; null and empty if nothing exceptional + is detected + """ # set_pipeline_attributes must be run first. 
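        # A minimal sketch of how a caller might consume the returned pair,
        # mirroring the looper.py hunk above ("permissive" stands in for
        # prj.permissive; names are illustrative only):
        #
        #     error_type, msg = sample.determine_missing_requirements()
        #     if msg:
        #         if permissive:
        #             _LOGGER.warn(msg)
        #         else:
        #             raise error_type(msg)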
if not hasattr(self, "required_inputs"): _LOGGER.warn("You must run set_pipeline_attributes " - "before confirm_required_inputs") - return True + "before determine_missing_requirements") + return None, "" if not self.required_inputs: _LOGGER.debug("No required inputs") - return True + return None, "" # First, attributes + missing, empty = [], [] for file_attribute in self.required_inputs_attr: _LOGGER.debug("Checking '{}'".format(file_attribute)) - if not hasattr(self, file_attribute): - message = "Missing required input attribute '{}'".\ - format(file_attribute) - _LOGGER.warn(message) - if not permissive: - raise IOError(message) - else: - return False - if getattr(self, file_attribute) is "": - message = "Empty required input attribute '{}'".\ - format(file_attribute) - _LOGGER.warn(message) - if not permissive: - raise IOError(message) - else: - return False + try: + attval = getattr(self, file_attribute) + except AttributeError: + _LOGGER.debug("Missing required input attribute '%s'", + file_attribute) + missing.append(file_attribute) + continue + if attval == "": + _LOGGER.debug("Empty required input attribute '%s'", + file_attribute) + empty.append(file_attribute) + + if missing or empty: + return AttributeError, \ + "Missing attributes: {}. Empty attributes: {}".\ + format(missing, empty) # Second, files missing_files = [] @@ -1462,20 +1470,16 @@ def confirm_required_inputs(self, permissive=False): for path in paths.split(" "): _LOGGER.debug("Checking path: '{}'".format(path)) if not _os.path.exists(path): - _LOGGER.warn("Missing required input file: '{}'".format(path)) + _LOGGER.debug("Missing required input file: '{}'". + format(path)) missing_files.append(path) - if len(missing_files) > 0: - message = "Missing/unreadable file(s): {}".\ - format(", ".join(["'{}'".format(path) - for path in missing_files])) - if not permissive: - raise IOError(message) - else: - _LOGGER.error(message) - return False - - return True + if not missing_files: + return None, "" + else: + missing_message = \ + "Missing file(s): {}".format(", ".join(missing_files)) + return IOError, missing_message def is_dormant(self): @@ -1510,14 +1514,13 @@ def get_attr_values(self, attrlist): Get value corresponding to each given attribute. :param str attrlist: name of an attribute storing a list of attr names - :return list: value (or empty string) corresponding to each named attr + :return list | NoneType: value (or empty string) corresponding to + each named attribute; null if this Sample's value for the + attribute given by the argument to the "attrlist" parameter is + empty/null, or if this Sample lacks the indicated attribute """ - if not hasattr(self, attrlist): - return None - - attribute_list = getattr(self, attrlist) - # If attribute is None, then value is also None. + attribute_list = getattr(self, attrlist, None) if not attribute_list: return None @@ -1525,8 +1528,7 @@ def get_attr_values(self, attrlist): attribute_list = [attribute_list] # Strings contained here are appended later so shouldn't be null. - return [getattr(self, attr) if hasattr(self, attr) else "" - for attr in attribute_list] + return [getattr(self, attr, "") for attr in attribute_list] def get_sheet_dict(self): @@ -1775,9 +1777,11 @@ def set_pipeline_attributes( pipeline_name, "ngs_input_files") self.required_inputs_attr = pipeline_interface.get_attribute( pipeline_name, "required_input_files") + # Ensure input_size is present. 
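        # For example (hypothetical interface data): with
        #     required_input_files: [data_source]
        # and no all_input_files entry, the fallback below leaves
        # all_inputs_attr == ["data_source"], so input_file_size can still be
        # computed from the required inputs.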
self.all_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "all_input_files") + pipeline_name, "all_input_files") or self.required_inputs_attr + # Convert attribute keys into values if self.ngs_inputs_attr: _LOGGER.debug("Handling NGS input attributes: '%s'", self.name) # NGS data inputs exit, so we can add attributes like @@ -1786,14 +1790,9 @@ def set_pipeline_attributes( self.set_read_type(permissive=permissive) else: _LOGGER.debug("No NGS inputs: '%s'", self.name) - - # input_size - if not self.all_inputs_attr: - self.all_inputs_attr = self.required_inputs_attr - - # Convert attribute keys into values self.required_inputs = self.get_attr_values("required_inputs_attr") self.all_inputs = self.get_attr_values("all_inputs_attr") + self.input_file_size = get_file_size(self.all_inputs) @@ -2192,14 +2191,19 @@ def get_arg_string(self, pipeline_name, sample): return argstring - def get_attribute(self, pipeline_name, attribute_key): - """ Return value of given attribute for named pipeline. """ + def get_attribute(self, pipeline_name, attribute_key, path_as_list=True): + """ + Return value of given attribute for named pipeline. + + :param str pipeline_name: name of the pipeline of interest + :param str attribute_key: name of the attribute of interest + :param bool path_as_list: whether to ensure that a string attribute + is returned as a list; this is useful for safe iteration over + the returned value. + """ config = self._select_pipeline(pipeline_name) - try: - value = config[attribute_key] - except KeyError: - value = None - return [value] if isinstance(value, str) else value + value = config.get(attribute_key) + return [value] if isinstance(value, str) and path_as_list else value def get_pipeline_name(self, pipeline): From c7b2d118cfa69e0947a6cd3556d1928f65fd1954 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 15:18:36 -0400 Subject: [PATCH 56/94] update tests to reflect new function name --- tests/test_looper.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_looper.py b/tests/test_looper.py index bcf63671..dfe8bf88 100644 --- a/tests/test_looper.py +++ b/tests/test_looper.py @@ -71,7 +71,9 @@ def test_data_sources_derivation(self, proj, sample_index): ) # Order may be lost due to mapping. # We don't care about that here, or about duplicates. 
- assert set(DERIVED_COLNAMES) == set(merged_columns) + expected = set(DERIVED_COLNAMES) + observed = set(merged_columns) + assert expected == observed @pytest.mark.parametrize(argnames="sample_index", @@ -123,7 +125,8 @@ def test_required_inputs(self, proj, pipe_iface, sample_index): observed_required_inputs = [os.path.basename(f) for f in sample.required_inputs] assert expected_required_inputs == observed_required_inputs - assert sample.confirm_required_inputs() + error_type, error_message = sample.determine_missing_requirements() + assert error_type is None and not error_message @pytest.mark.parametrize(argnames="sample_index", @@ -138,7 +141,8 @@ def test_ngs_pipe_ngs_sample(self, proj, pipe_iface, sample_index): [sample_index][0]) observed_required_input_basename = \ os.path.basename(sample.required_inputs[0]) - assert sample.confirm_required_inputs() + error_type, error_message = sample.determine_missing_requirements() + assert error_type is None and not error_message assert 1 == len(sample.required_inputs) assert expected_required_input_basename == \ observed_required_input_basename From aab1f54394635e59fd4099994cd3067f1bd17065 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 15:28:23 -0400 Subject: [PATCH 57/94] check for null merge table --- looper/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/looper/models.py b/looper/models.py index 82eb1021..0ee4b660 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1006,10 +1006,13 @@ def _make_basic_samples(self): # base Sample objects if they don't already exist. if hasattr(self.metadata, "merge_table"): if self.merge_table is None: - if _os.path.isfile(self.metadata.merge_table): + if self.metadata.merge_table and \ + _os.path.isfile(self.metadata.merge_table): self.merge_table = _pd.read_table( self.metadata.merge_table, sep=None, engine="python") + _LOGGER.debug("Merge table shape: {}". + format(self.merge_table.shape)) else: _LOGGER.debug( "Alleged path to merge table data is not a " From ca118fa6e9b97bb8ded538e0fb76511cc968db3b Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 19:34:06 -0400 Subject: [PATCH 58/94] prevent nulls in sample YAML; generate Sample filename from subtype --- looper/looper.py | 64 ++++++++++++++++++------------ looper/models.py | 92 +++++++++++++++++++++++++++++++++----------- tests/test_looper.py | 3 +- 3 files changed, 110 insertions(+), 49 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 96799951..4178ef7e 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -240,6 +240,14 @@ def run(prj, args, remaining_args): # TODO: determine what to do with subtype(s) here. # Processing preconditions have been met. processed_samples.add(sample.sample_name) + + # At this point, we have a generic Sample; write that to disk + # for reuse in case of many jobs (pipelines) using base Sample. + # Do a single overwrite here, then any subsequent Sample can be sure + # that the file is fresh, with respect to this run of looper. + sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir) + + # Store the base Sample data for reuse in creating subtype(s). sample_data = sample.as_series() # Go through all pipelines to submit for this protocol. 
@@ -254,13 +262,6 @@ def run(prj, args, remaining_args): _LOGGER.debug("Creating %s instance: '%s'", sample_subtype.__name__, sample.sample_name) sample = sample_subtype(sample_data) - pipeline_name, _ = os.path.splitext(pipeline_key) - if sample_subtype != Sample: - # Only rewrite the file if we have a proper subtype. - _LOGGER.debug("Representing sample '%s' on disk as %s", - sample.sample_name, sample_subtype.__name__) - sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir, - pipeline_name=pipeline_name) # The current sample is active. # For each pipeline submission consideration, start fresh. @@ -319,14 +320,16 @@ def run(prj, args, remaining_args): # Sample-level arguments are handled by the pipeline interface. try: argstring = pipeline_interface.get_arg_string( - pipeline_key, sample) - argstring += " " + pipeline_name=pipeline_key, sample=sample, + submission_folder_path=prj.metadata.submission_subdir) except AttributeError: # TODO: inform about which missing attribute(s). fail_message = "Required attribute(s) missing " \ "for pipeline arguments string" _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) + else: + argstring += " " if skip_reasons: # Sample is active, but we've at least 1 pipeline skip reason. @@ -383,7 +386,7 @@ def run(prj, args, remaining_args): # Submit job! _LOGGER.debug("Attempting job submission: '%s' ('%s')", - sample.sample_name, pipeline_name) + sample.sample_name, pl_name) submitted = cluster_submit( sample, prj.compute.submission_template, prj.compute.submission_command, submit_settings, @@ -617,13 +620,13 @@ def _submission_status_text(curr, total, sample_name, sample_library): def cluster_submit( - sample, submit_template, submission_command, variables_dict, - submission_folder, sample_output_folder, pipeline_name, time_delay, - submit=False, dry_run=False, ignore_flags=False, remaining_args=None): + sample, submit_template, submission_command, variables_dict, + submission_folder, sample_output_folder, pipeline_name, time_delay, + submit=False, dry_run=False, ignore_flags=False, remaining_args=None): """ - Submit job to cluster manager. - - :param models.Sample sample: the sample object for submission + Write cluster submission script to disk and submit job for given Sample. + + :param models.Sample sample: the Sample object for submission :param str submit_template: path to submission script template :param str submission_command: actual command with which to execute the submission of the cluster job for the given sample @@ -662,12 +665,10 @@ def cluster_submit( if not os.path.exists(submit_script_dirpath): os.makedirs(submit_script_dirpath) + # Add additional arguments, populate template fields, and write to disk. with open(submit_template, 'r') as handle: filedata = handle.read() - - # Update variable dict with any additional arguments. variables_dict["CODE"] += " " + str(" ".join(remaining_args or [])) - # Fill in submit_template with variables. for key, value in variables_dict.items(): # Here we add brackets around the key names and use uppercase because # this is how they are encoded as variables in the submit templates. @@ -675,17 +676,32 @@ def cluster_submit( with open(submit_script, 'w') as handle: handle.write(filedata) - # Prepare and write sample yaml object - _LOGGER.debug("Writing sample '%s' representation to disk", - sample.sample_name) - sample.to_yaml(subs_folder_path=submission_folder) + # Ensure existence of on-disk representation of this sample. 
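    # As a sketch of the expected layout (sample name and subtype here are
    # hypothetical; filenames follow generate_filename in the models.py hunk
    # below): a base Sample named "frog_1" reuses
    # <submission_folder>/frog_1.yaml already written by run(), while a
    # subtype such as ATACseqSample is re-serialized to
    # <submission_folder>/frog_1_ATACseqSample.yaml.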
+ if type(sample) is Sample: + # run() writes base Sample to disk for each non-skipped sample. + expected_filepath = os.path.join( + submission_folder, "{}.yaml".format(sample.name)) + _LOGGER.debug("Base Sample, to reuse file: '%s'", + expected_filepath) + if not os.path.exists(expected_filepath): + _LOGGER.warn("Missing expected Sample file; creating") + sample.to_yaml(subs_folder_path=submission_folder) + else: + _LOGGER.debug("Base Sample file exists") + else: + # Serialize Sample, generate data for disk, and write. + name_sample_subtype = sample.__class__.__name__ + _LOGGER.debug("Writing %s representation to disk: '%s'", + name_sample_subtype, sample.name) + sample.to_yaml(subs_folder_path=submission_folder) # Check if job is already submitted (unless ignore_flags is set to True) if not ignore_flags: flag_files = glob.glob(os.path.join( sample_output_folder, pipeline_name + "*.flag")) if len(flag_files) > 0: - _LOGGER.info("> Not submitting, flag(s) found: {}".format(flag_files)) + _LOGGER.info("> Not submitting, flag(s) found: {}". + format(flag_files)) submit = False else: pass diff --git a/looper/models.py b/looper/models.py index 0ee4b660..f8faa6e8 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1485,24 +1485,22 @@ def determine_missing_requirements(self): return IOError, missing_message - def is_dormant(self): + def generate_filename(self, delimiter="_"): """ - Determine whether this Sample is inactive. + Create a name for file in which to represent this Sample. - By default, a Sample is regarded as active. That is, if it lacks an - indication about activation status, it's assumed to be active. If, - however, and there's an indication of such status, it must be '1' - in order to be considered switched 'on.' + This uses knowledge of the instance's subtype, sandwiching a delimiter + between the name of this Sample and the name of the subtype before the + extension. If the instance is a base Sample type, then the filename + is simply the sample name with an extension. - :return bool: whether this Sample's been designated as dormant + :param str delimiter: what to place between sample name and name of + subtype; this is only relevant if the instance is of a subclass + :return str: name for file with which to represent this Sample on disk """ - try: - flag = self[SAMPLE_EXECUTION_TOGGLE] - except KeyError: - # Regard default Sample state as active. - return False - # If specified, the activation flag must be set to '1'. - return flag != "1" + base = self.name if type(self) is Sample else \ + "{}{}{}".format(self.name, delimiter, self.__class__.__name__) + return "{}.yaml".format(base) def generate_name(self): @@ -1590,6 +1588,26 @@ def infer_columns(self, implications): implier_name, implier_value) + def is_dormant(self): + """ + Determine whether this Sample is inactive. + + By default, a Sample is regarded as active. That is, if it lacks an + indication about activation status, it's assumed to be active. If, + however, and there's an indication of such status, it must be '1' + in order to be considered switched 'on.' + + :return bool: whether this Sample's been designated as dormant + """ + try: + flag = self[SAMPLE_EXECUTION_TOGGLE] + except KeyError: + # Regard default Sample state as active. + return False + # If specified, the activation flag must be set to '1'. 
+ return flag != "1" + + def locate_data_source(self, data_sources, column_name=DATA_SOURCE_COLNAME, source_key=None, extra_vars=None): """ @@ -1895,17 +1913,16 @@ def set_read_type(self, n=10, permissive=True): feature, self.name) - def to_yaml(self, path=None, subs_folder_path=None, pipeline_name=None): + def to_yaml(self, path=None, subs_folder_path=None, delimiter="_"): """ Serializes itself in YAML format. :param str path: A file path to write yaml to; provide this or the subs_folder_path - :param str pipeline_name: name of a pipeline to which this particular - Sample instance pertains (i.e., perhaps the name of a module - that defined a Sample subclass of which this is an instance) :param str subs_folder_path: path to folder in which to place file that's being written; provide this or a full filepath + :param str delimiter: text to place between the sample name and the + suffix within the filename; irrelevant if there's no suffix :return str: filepath used (same as input if given, otherwise the path value that was inferred) :raises ValueError: if neither full filepath nor path to extant @@ -1923,9 +1940,14 @@ def to_yaml(self, path=None, subs_folder_path=None, pipeline_name=None): "To represent {} on disk, provide a full path or a path " "to a parent (submissions) folder". format(self.__class__.__name__)) - filename = "{}_{}.yaml".format(self.sample_name, pipeline_name) \ - if pipeline_name else "{}.yaml".format(self.sample_name) + _LOGGER.debug("Creating filename for %s: '%s'", + self.__class__.__name__, self.name) + filename = self.generate_filename(delimiter=delimiter) + _LOGGER.debug("Filename: '%s'", filename) path = _os.path.join(subs_folder_path, filename) + + _LOGGER.debug("Setting %s filepath: '%s'", + self.__class__.__name__, path) self.yaml_file = path @@ -2129,15 +2151,28 @@ def file_size_ante(name, data): return rp_data - def get_arg_string(self, pipeline_name, sample): + def get_arg_string(self, pipeline_name, sample, + submission_folder_path="", **null_replacements): """ For a given pipeline and sample, return the argument string :param str pipeline_name: Name of pipeline. :param Sample sample: current sample for which job is being built + :param str submission_folder_path: path to folder in which files + related to submission of this sample will be placed. + :param dict null_replacements: mapping from name of Sample attribute + name to value to use in arg string if Sample attribute's value + is null :return str: command-line argument string for pipeline """ + # It's undesirable to put a null value in the argument string. + default_filepath = _os.path.join( + submission_folder_path, sample.generate_filename()) + _LOGGER.debug("Default sample filepath: '%s'", default_filepath) + proxies = {"yaml_file": default_filepath} + proxies.update(null_replacements) + _LOGGER.debug("Building arguments string") config = self._select_pipeline(pipeline_name) argstring = "" @@ -2151,8 +2186,7 @@ def get_arg_string(self, pipeline_name, sample): for key, value in args.iteritems(): if value is None: - _LOGGER.debug("Null value for opt arg key '%s'", - str(key)) + _LOGGER.debug("Null value for opt arg key '%s'", str(key)) continue try: arg = getattr(sample, value) @@ -2164,6 +2198,18 @@ def get_arg_string(self, pipeline_name, sample): pipeline_name, value, key) raise + # It's undesirable to put a null value in the argument string. 
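                # A hypothetical illustration: if the interface maps
                # "--sample-yaml" to the Sample attribute "yaml_file" and that
                # attribute is still null, the proxies mapping above supplies
                # the default submission-folder path instead:
                #
                #     arg = proxies["yaml_file"]
                #     # e.g. "<submission_folder>/frog_1.yaml"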
+ if arg is None: + _LOGGER.debug("Sample is null for attribute: '%s'", value) + try: + arg = proxies[value] + except KeyError: + errmsg = "Can't add null Sample attribute to pipeline " \ + "argument string: '{}'".format(value) + raise ValueError(errmsg) + _LOGGER.debug("Found default for '{}': '{}'". + format(value, arg)) + _LOGGER.debug("Adding '{}' from attribute '{}' for argument '{}'". format(arg, value, key)) argstring += " " + str(key) + " " + str(arg) diff --git a/tests/test_looper.py b/tests/test_looper.py index dfe8bf88..5b11d001 100644 --- a/tests/test_looper.py +++ b/tests/test_looper.py @@ -67,8 +67,7 @@ def test_data_sources_derivation(self, proj, sample_index): merged_columns = filter( lambda col_key: (col_key != "col_modifier") and not col_key.endswith(COL_KEY_SUFFIX), - proj.samples[sample_index].merged_cols.keys() - ) + proj.samples[sample_index].merged_cols.keys()) # Order may be lost due to mapping. # We don't care about that here, or about duplicates. expected = set(DERIVED_COLNAMES) From dbfa23df6afeebf0d003b5d045b3067aba2b53c7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 22:08:05 -0400 Subject: [PATCH 59/94] fix pipeline interface tests --- .../independent/test_PipelineInterface.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 8da94069..09bbc6d6 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -1,6 +1,7 @@ """ Tests for PipelineInterface ADT. """ import copy +import inspect import itertools import random @@ -8,7 +9,7 @@ import yaml from looper.models import \ - PipelineInterface, _InvalidResourceSpecificationException, \ + PipelineInterface, Sample, _InvalidResourceSpecificationException, \ _MissingPipelineConfigurationException, DEFAULT_COMPUTE_RESOURCES_NAME @@ -98,7 +99,9 @@ def pi_with_resources(request, basic_pipe_iface_data, resources): @pytest.mark.parametrize( argnames="funcname_and_kwargs", argvalues=[("choose_resource_package", {"file_size": 4}), - ("get_arg_string", {"sample": "arbitrary-sample-name"}), + ("get_arg_string", + {"sample": Sample( + {"sample_name": "arbitrary-sample-name"})}), ("get_attribute", {"attribute_key": "irrelevant-attr-name"}), ("get_pipeline_name", {}), @@ -115,9 +118,17 @@ def test_unconfigured_pipeline_exception( except KeyError: # Already no default resource package. pass + + # Each of the functions being tested should take pipeline_name arg, + # and we want to test behavior for the call on an unknown pipeline. 
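        # For instance (illustrative only), for get_arg_string the inspection
        # below yields roughly
        #     inspect.getargspec(func).args
        #     # ['self', 'pipeline_name', 'sample', 'submission_folder_path']
        # so "pipeline_name" is filled in with the dummy "missing-pipeline".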
funcname, kwargs = funcname_and_kwargs + func = getattr(pi, funcname) + required_parameters = inspect.getargspec(func).args + for parameter in ["pipeline_name", "pipeline"]: + if parameter in required_parameters and parameter not in kwargs: + kwargs[parameter] = "missing-pipeline" with pytest.raises(_MissingPipelineConfigurationException): - getattr(pi, funcname).__call__("missing-pipeline", **kwargs) + func.__call__(**kwargs) From 1bdb4716be086689a87ba97db7204184cfdbdbfd Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 21 Jun 2017 22:26:50 -0400 Subject: [PATCH 60/94] trace messages --- looper/models.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/looper/models.py b/looper/models.py index f8faa6e8..26d0862e 100644 --- a/looper/models.py +++ b/looper/models.py @@ -170,6 +170,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): getattr(sample, SAMPLE_NAME_COLNAME) merge_rows = merge_table[sample_indexer] + _LOGGER.log(5, "%d rows to merge", len(merge_rows)) if len(merge_rows) > 0: # For each row in the merge table of this sample: # 1) populate any derived columns @@ -185,6 +186,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): for col in merge_rows.columns: if col == SAMPLE_NAME_COLNAME or \ col not in derived_columns: + _LOGGER.log(5, "Skipping column: '%s'", col) continue # Initialize key in parent dict. col_key = col + COL_KEY_SUFFIX @@ -193,6 +195,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): row_dict[col] = sample.locate_data_source( data_sources, col, row_dict[col], row_dict) # 1) + _LOGGER.log(5, "Adding derived columns") # Also add in any derived cols present. for col in derived_columns: # Skip over attributes that the sample @@ -200,6 +203,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): # data from the current (row's) data. if not hasattr(sample, col) or \ col in row_dict: + _LOGGER.log(5, "Skipping column: '%s'", col) continue # Map column name key to sample's value # for the attribute given by column name. @@ -219,8 +223,9 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): # and then set to sample attribute. 
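            # For example (hypothetical values): two merge rows for one sample
            # leave something like
            #     merged_cols["file"] == "frog_1a_data.txt frog_1b_data.txt"
            # i.e. one space-delimited string per merged column.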
for key, val in row_dict.items(): if key == SAMPLE_NAME_COLNAME or not val: + _LOGGER.log(5, "Skipping KV: {}={}".format(key, val)) continue - _LOGGER.debug("merge: sample '%s'; %s=%s", + _LOGGER.log(5, "merge: sample '%s'; %s=%s", str(sample.name), str(key), str(val)) if not key in merged_cols: new_val = str(val).rstrip() @@ -1028,6 +1033,7 @@ def merge(s): return s else: def merge(s): + _LOGGER.log(5, "Doing column merge: '%s'", s.sample_name) return merge_sample(s, self.merge_table, self.data_sources, self.derived_columns) @@ -1654,7 +1660,8 @@ def locate_data_source(self, data_sources, column_name=DATA_SOURCE_COLNAME, except KeyError: _LOGGER.warn( "Config lacks entry for data_source key: '{}' " - "(in column: '{}')".format(source_key, column_name)) + "in column '{}'; known: {}".format( + source_key, column_name, data_sources.keys())) return "" # Populate any environment variables like $VAR with os.environ["VAR"] From 54c76c4deb2c54ade6f34fc5d464895842623e44 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 22 Jun 2017 09:40:38 -0400 Subject: [PATCH 61/94] Update cluster docs --- doc/source/cluster-computing.rst | 18 +++++++++++------- doc/source/config-files.rst | 8 +++++--- doc/source/pipeline-interface.rst | 7 ++++--- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/doc/source/cluster-computing.rst b/doc/source/cluster-computing.rst index 8e382d7d..32ac1bd0 100644 --- a/doc/source/cluster-computing.rst +++ b/doc/source/cluster-computing.rst @@ -3,14 +3,17 @@ Cluster computing ============================================= + By default, looper will build a shell script for each sample and then run each sample serially on the local computer. But where looper really excels is in large projects that require submitting these jobs to a cluster resource manager (like SLURM, SGE, LFS, etc.). Looper handles the interface to the resource manager so that projects and pipelines can be moved to different environments with ease. -To configure looper to use cluster computing, all you have to do is tell looper a few things about your cluster setup: you create a configuration file (`compute_config.yaml`) and point an environment variable (``PEPENV``) to this file, and that's it! Complete, step-by-step instructions and examples are available in the pepenv repository at https://github.com/pepkit/pepenv. +To configure looper to use cluster computing, all you have to do is tell looper a few things about your cluster setup: you create a configuration file (`compute_config.yaml`) and point an environment variable (``PEPENV``) to this file, and that's it! + +Following is a brief overview to familiarize you with how this will work. When you're ready to hook looper up to your compute cluster, you should follow the complete, step-by-step instructions and examples in the pepenv repository at https://github.com/pepkit/pepenv. -Compute config overview +PEPENV overview **************************************** -If you're not quite ready to set it up and just want an overview, here is an example ``compute_config.yaml`` file that works with a SLURM environment: +Here is an example ``compute_config.yaml`` file that works with a SLURM environment: .. code-block:: yaml @@ -31,12 +34,13 @@ The sub-sections below ``compute`` each define a "compute package" that can be a There are two or three sub-parameters for a compute package: - - **submission_template** is a (relative or absolute) path to the template submission script. Templates are described in more detail in the `pepenv readme `_. 
+ - **submission_template** is a (relative or absolute) path to the template submission script. Templates files contain variables that are populated with values for each job you submit. More details are in the `pepenv readme `_. - **submission_command** is the command-line command that `looper` will prepend to the path of the produced submission script to actually run it (`sbatch` for SLURM, `qsub` for SGE, `sh` for localhost, etc). - - **partition** is necessary only if you need to specify a queue name + - **partition** specifies a queue name (optional). -Submission templates +Resources **************************************** -A template uses variables (encoded like `{VARIABLE}`), which will be populated independently for each sample as defined in `pipeline_interface.yaml`. The one variable ``{CODE}`` is a reserved variable that refers to the actual command that will run the pipeline. Otherwise, you can use any variables you define in your `pipeline_interface.yaml`. In `Templates `__ are examples for submission templates for `SLURM `__, `SGE `__, and `local runs `__. You can also create your own templates, giving looper ultimate flexibility to work with any compute infrastructure in any environment. +You may notice that the compute config file does not specify resources to request (like memory, CPUs, or time). Yet, these are required as well in order to submit a job to a cluster. In the looper system, **resources are not handled by the pepenv file** because they not relative to a particular computing environment; instead they are are variable and specific to a pipeline and a sample. As such, these items are defined in the ``pipeline_interface.yaml`` file (``pipelines`` section) that connects looper to a pipeline. The reason for this is that the pipeline developer is the most likely to know what sort of resources her pipeline requires, so she is in the best position to define the resources requested. +For more information on how to adjust resources, see the :ref:`pipeline interface ` documentation. If all the different configuration files seem confusing, now would be a good time to review :doc:`who's who in configuration files `. \ No newline at end of file diff --git a/doc/source/config-files.rst b/doc/source/config-files.rst index da746767..7c25a985 100644 --- a/doc/source/config-files.rst +++ b/doc/source/config-files.rst @@ -2,7 +2,7 @@ Configuration files ========================= -Looper uses `YAML `_ configuration files to describe a project. Looper is a very modular system, so there are few different YAML files. Since it's easy to confuse what the different configuration files are used for, here's an explanation of each. Which ones you need to know about will depend on whether you're a pipeline user (running pipelines on your project) or a pipeline developer (building your own pipeline). +Looper uses `YAML `_ configuration files for several purposes. Looper is designed to be organized, modular, and very configurable, so there are several configuration files. We've organized the configuration files so they each handle a different level of infrastructure: environment, project, sample, or pipeline. This makes the system very adaptable and portable, but for a newcomer, it is easy to confuse what the different configuration files are used for. So, here's an explanation of each for you to use as a reference until you are familiar with the whole ecosystem. 
Which ones you need to know about will depend on whether you're a pipeline user (running pipelines on your project) or a pipeline developer (building your own pipeline). Pipeline users @@ -14,12 +14,14 @@ Users (non-developers) of pipelines only need to be aware of one or two YAML fil If you are planning to submit jobs to a cluster, then you need to know about a second YAML file: -- :ref:`PEPENV environment config `: This file tells looper how to use compute resource managers, like SLURM. You can find examples and instructions for setting this up at https://github.com/pepkit/pepenv. This file doesn't require much editing or maintenance beyond initial setup. +- :ref:`PEPENV environment config `: This file tells looper how to use compute resource managers, like SLURM. This file doesn't require much editing or maintenance beyond initial setup. + +That should be all you need to worry about as a pipeline user. If you need to adjust compute resources or want to develop a pipeline or have more advanced project-level control over pipelines, then you'll need to know about a few others: Pipeline developers ***************** -If you want to add a new pipeline to looper or tweak the way looper interacts with a pipeline for a given project, then you need to know about a configuration file that coordinates linking your pipeline in to your looper project. +If you want to add a new pipeline to looper, tweak the way looper interacts with a pipeline for a given project, or change the default cluster resources requested by a pipeline, then you need to know about a configuration file that coordinates linking your pipeline in to your looper project. - :doc:`pipeline interface file `: Has two sections: 1) ``protocol_mapping`` tells looper which pipelines exist, and how to map each protocol (sample data type) to its pipelines; 2) ``pipelines`` links looper to the pipelines by describing variables, options and paths that the pipeline needs to know to run and outlines resource requirements for cluster managers. diff --git a/doc/source/pipeline-interface.rst b/doc/source/pipeline-interface.rst index 81d13c28..4e6f7685 100644 --- a/doc/source/pipeline-interface.rst +++ b/doc/source/pipeline-interface.rst @@ -1,12 +1,13 @@ +.. _pipeline-interface-pipelines: Pipeline interface section: pipelines ************************************************** -The ``pipelines`` section specifies to looper which command-line arguments to pass to the pipeline. In addition, if you're using a cluster resource manager, it also specifies which compute resources to request. For each pipeline, you specify variables (some optional and some required). The possible attributes to specify for each pipeline include: +The ``pipelines`` section specifies command-line arguments required by the pipeline. In addition, if you're using a cluster resource manager, it also specifies which compute resources to request. For each pipeline, you specify variables (some optional and some required). The possible attributes to specify for each pipeline include: - ``name`` (recommended): Name of the pipeline. This is used to assess pipeline flags (if your pipeline employs them, like pypiper pipelines). -- ``arguments`` (required): List of key-value pairs of arguments, and attribute sources to pass to the pipeline. The key corresponds verbatim to the string that will be passed on the command line to the pipeline. 
The value corresponds to an attribute of the sample, which will be derived from the sample_annotation csv file (in other words, it's a column name of your sample annotation sheet). -- ``path`` (required): Absolute or relative path to the script for this pipeline. Relative paths are considered relative to your **pipeline_interface file**. +- ``arguments`` (required): List of key-value pairs of arguments, and attribute sources to pass to the pipeline. The key corresponds verbatim to the string that will be passed on the command line to the pipeline. The value corresponds to an attribute of the sample, which will be derived from the sample_annotation csv file (in other words, it's a column name of your sample annotation sheet). For flag-like arguments that lack a value, you may specify `null` as the value (e.g. `"--quiet-mode": null`). +- ``path`` (required): Absolute or relative path to the script for this pipeline. Relative paths are considered relative to your **pipeline_interface file**. We strongly recommend using relative paths where possible to keep your pipeline interface file portable. You may also use environment variables (like ``${HOME}``) in the ``path``. - ``required_input_files`` (optional): A list of sample attributes (annotation sheets column names) that will point to input files that must exist. - ``all_input_files`` (optional): A list of sample attributes (annotation sheet column names) that will point to input files that are not required, but if they exist, should be counted in the total size calculation for requesting resources. - ``ngs_input_files`` (optional): For pipelines using sequencing data, provide a list of sample attributes (annotation sheet column names) that will point to input files to be used for automatic detection of ``read_length`` and ``read_type`` sample attributes. From be09c636ddaa3de2e4251af2284f84935ffd9ce5 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 22 Jun 2017 10:08:47 -0400 Subject: [PATCH 62/94] put merge and column derivation in proper order --- looper/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/looper/models.py b/looper/models.py index 26d0862e..665c90cd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1044,7 +1044,6 @@ def merge(s): sample.set_genome(self.get("genomes")) sample.set_transcriptome(self.get("transcriptomes")) - sample.set_file_paths(self) # Hack for backwards-compatibility # Pipelines should now use `data_source`) try: @@ -1053,6 +1052,7 @@ def merge(s): _LOGGER.debug("Sample '%s' lacks data source; skipping " "data path assignment", sample.sample_name) sample = merge(sample) + sample.set_file_paths(self) samples.append(sample) return samples From 075ec4bac548d570872f410030b161e78d066810 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 22 Jun 2017 11:54:43 -0400 Subject: [PATCH 63/94] cleaner handling of merge call from Sample builder; return the columns not the Sample; enable dedent; downgrade messages --- looper/looper.py | 6 +- looper/models.py | 221 +++++++++++++++++++++++++---------------------- 2 files changed, 122 insertions(+), 105 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index 4178ef7e..a2f296fd 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -293,12 +293,12 @@ def run(prj, args, remaining_args): # Check if single_or_paired value is recognized. if hasattr(sample, "read_type"): - # Drop "-end", "_end", or just "end" from end of the column value. + # Drop "-end", "_end", or "end" from end of the column value. 
sample.read_type = re.sub( '[_\\-]?end$', '', str(sample.read_type)).lower() if sample.read_type not in valid_read_types: - skip_reasons.append( - "read_type must be in {}".format(valid_read_types)) + skip_reasons.append("read_type must be in {}". + format(valid_read_types)) # Identify cluster resources required for this submission. submit_settings = pipeline_interface.choose_resource_package( diff --git a/looper/models.py b/looper/models.py index 665c90cd..9f3cfb90 100644 --- a/looper/models.py +++ b/looper/models.py @@ -158,89 +158,101 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): :param merge_table: data with which to alter Sample :param Mapping data_sources: collection of named paths to data locations :param derived_columns: names of columns with data-derived value - :return Sample: updated input instance + :return Set[str]: names of columns that were merged """ + merged_cols = {} + + if not merge_table: + _LOGGER.debug("No data for sample merge, skipping") + return merged_cols + if SAMPLE_NAME_COLNAME not in merge_table.columns: raise KeyError( "Merge table requires a column named '{}'.". format(SAMPLE_NAME_COLNAME)) + _LOGGER.debug("Merging Sample with data sources: {}". + format(data_sources)) + _LOGGER.debug("Merging Sample with derived columns: {}". + format(derived_columns)) + sample_indexer = merge_table[SAMPLE_NAME_COLNAME] == \ getattr(sample, SAMPLE_NAME_COLNAME) merge_rows = merge_table[sample_indexer] + if len(merge_rows) == 0: + _LOGGER.debug("No merge rows for sample '%s', skipping", sample.name) + return merged_cols + _LOGGER.log(5, "%d rows to merge", len(merge_rows)) - if len(merge_rows) > 0: - # For each row in the merge table of this sample: - # 1) populate any derived columns - # 2) derived columns --> space-delimited strings - # 3) update the sample values with the merge table - - # Keep track of merged cols, - # so we don't re-derive them later. - merged_cols = { - key: "" for key in merge_rows.columns} - for _, row in merge_rows.iterrows(): - row_dict = row.to_dict() - for col in merge_rows.columns: - if col == SAMPLE_NAME_COLNAME or \ - col not in derived_columns: - _LOGGER.log(5, "Skipping column: '%s'", col) - continue - # Initialize key in parent dict. - col_key = col + COL_KEY_SUFFIX - merged_cols[col_key] = "" - row_dict[col_key] = row_dict[col] - row_dict[col] = sample.locate_data_source( - data_sources, col, row_dict[col], row_dict) # 1) - - _LOGGER.log(5, "Adding derived columns") - # Also add in any derived cols present. - for col in derived_columns: - # Skip over attributes that the sample - # either lacks, and those covered by the - # data from the current (row's) data. - if not hasattr(sample, col) or \ - col in row_dict: - _LOGGER.log(5, "Skipping column: '%s'", col) - continue - # Map column name key to sample's value - # for the attribute given by column name. - col_key = col + COL_KEY_SUFFIX - row_dict[col_key] = getattr(sample, col) - # Map the column name itself to the - # populated data source template string. - row_dict[col] = sample.locate_data_source( - data_sources, col, getattr(sample, col), row_dict) - _LOGGER.debug("PROBLEM adding derived column: " - "{}, {}, {}".format(col, row_dict[col], - getattr(sample, col))) - - # Since we are now jamming multiple (merged) - # entries into a single attribute, we have to - # join them into a space-delimited string - # and then set to sample attribute. 
- for key, val in row_dict.items(): - if key == SAMPLE_NAME_COLNAME or not val: - _LOGGER.log(5, "Skipping KV: {}={}".format(key, val)) - continue - _LOGGER.log(5, "merge: sample '%s'; %s=%s", - str(sample.name), str(key), str(val)) - if not key in merged_cols: - new_val = str(val).rstrip() - else: - new_val = "{} {}".format( - merged_cols[key], str(val)).strip() - merged_cols[key] = new_val # 2) - # Don't update sample_name. - merged_cols.pop(SAMPLE_NAME_COLNAME, None) + # For each row in the merge table of this sample: + # 1) populate any derived columns + # 2) derived columns --> space-delimited strings + # 3) update the sample values with the merge table + # Keep track of merged cols, + # so we don't re-derive them later. + merged_cols = {key: "" for key in merge_rows.columns} + for _, row in merge_rows.iterrows(): + row_dict = row.to_dict() + for col in merge_rows.columns: + if col == SAMPLE_NAME_COLNAME or \ + col not in derived_columns: + _LOGGER.log(5, "Skipping column: '%s'", col) + continue + # Initialize key in parent dict. + col_key = col + COL_KEY_SUFFIX + merged_cols[col_key] = "" + row_dict[col_key] = row_dict[col] + row_dict[col] = sample.locate_data_source( + data_sources, col, row_dict[col], row_dict) # 1) + + _LOGGER.log(5, "Adding derived columns") + # Also add in any derived cols present. + for col in derived_columns: + # Skip over attributes that the sample + # either lacks, and those covered by the + # data from the current (row's) data. + if not hasattr(sample, col) or \ + col in row_dict: + _LOGGER.log(5, "Skipping column: '%s'", col) + continue + # Map column name key to sample's value + # for the attribute given by column name. + col_key = col + COL_KEY_SUFFIX + row_dict[col_key] = getattr(sample, col) + # Map the column name itself to the + # populated data source template string. + row_dict[col] = sample.locate_data_source( + data_sources, col, getattr(sample, col), row_dict) + _LOGGER.debug("PROBLEM adding derived column: " + "{}, {}, {}".format(col, row_dict[col], + getattr(sample, col))) + + # Since we are now jamming multiple (merged) + # entries into a single attribute, we have to + # join them into a space-delimited string + # and then set to sample attribute. + for key, val in row_dict.items(): + if key == SAMPLE_NAME_COLNAME or not val: + _LOGGER.log(5, "Skipping KV: {}={}".format(key, val)) + continue + _LOGGER.log(5, "merge: sample '%s'; %s=%s", + str(sample.name), str(key), str(val)) + if not key in merged_cols: + new_val = str(val).rstrip() + else: + new_val = "{} {}".format( + merged_cols[key], str(val)).strip() + merged_cols[key] = new_val # 2) + + # Don't update sample_name. + merged_cols.pop(SAMPLE_NAME_COLNAME, None) - sample.update(merged_cols) # 3) - sample.merged = True # mark sample as merged - sample.merged_cols = merged_cols - sample.merged = True + sample.update(merged_cols) # 3) + sample.merged_cols = merged_cols + sample.merged = True return sample @@ -1027,16 +1039,6 @@ def _make_basic_samples(self): else: _LOGGER.debug("No merge table") - # Define merge behavior based on presence of merge table. - if self.merge_table is None: - def merge(s): - return s - else: - def merge(s): - _LOGGER.log(5, "Doing column merge: '%s'", s.sample_name) - return merge_sample(s, self.merge_table, self.data_sources, - self.derived_columns) - # Create the Sample(s). 
samples = [] for _, row in self.sheet.iterrows(): @@ -1044,15 +1046,19 @@ def merge(s): sample.set_genome(self.get("genomes")) sample.set_transcriptome(self.get("transcriptomes")) + sample.set_file_paths(self) # Hack for backwards-compatibility # Pipelines should now use `data_source`) + _LOGGER.debug("Setting sample's data path") try: sample.data_path = sample.data_source except AttributeError: _LOGGER.debug("Sample '%s' lacks data source; skipping " "data path assignment", sample.sample_name) - sample = merge(sample) - sample.set_file_paths(self) + else: + _LOGGER.debug("Path to sample data: '%s'", sample.data_source) + merge_sample(sample, self.merge_table, + self.data_sources, self.derived_columns) samples.append(sample) return samples @@ -1454,18 +1460,21 @@ def determine_missing_requirements(self): # First, attributes missing, empty = [], [] for file_attribute in self.required_inputs_attr: - _LOGGER.debug("Checking '{}'".format(file_attribute)) + _LOGGER.log(5, "Checking '{}'".format(file_attribute)) try: attval = getattr(self, file_attribute) except AttributeError: - _LOGGER.debug("Missing required input attribute '%s'", + _LOGGER.log(5, "Missing required input attribute '%s'", file_attribute) missing.append(file_attribute) continue if attval == "": - _LOGGER.debug("Empty required input attribute '%s'", + _LOGGER.log(5, "Empty required input attribute '%s'", file_attribute) empty.append(file_attribute) + else: + _LOGGER.log(5, "'{}' is valid: '{}'". + format(file_attribute, attval)) if missing or empty: return AttributeError, \ @@ -1475,11 +1484,12 @@ def determine_missing_requirements(self): # Second, files missing_files = [] for paths in self.required_inputs: + _LOGGER.log(5, "Text to split and check paths: '%s'", paths) # There can be multiple, space-separated values here. for path in paths.split(" "): - _LOGGER.debug("Checking path: '{}'".format(path)) + _LOGGER.log(5, "Checking path: '{}'".format(path)) if not _os.path.exists(path): - _LOGGER.debug("Missing required input file: '{}'". + _LOGGER.log(5, "Missing required input file: '{}'". format(path)) missing_files.append(path) @@ -1801,26 +1811,34 @@ def set_pipeline_attributes( # Settings ending in _attr are lists of attribute keys. # These attributes are then queried to populate values # for the primary entries. - self.ngs_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "ngs_input_files") - self.required_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "required_input_files") - # Ensure input_size is present. - self.all_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "all_input_files") or self.required_inputs_attr - - # Convert attribute keys into values + req_attr_names = [("ngs_input_files", "ngs_inputs_attr"), + ("required_input_files", "required_inputs_attr"), + ("all_input_files", "all_inputs_attr")] + for name_src_attr, name_dst_attr in req_attr_names: + _LOGGER.log(5, "Value of '%s' will be assigned to '%s'", + name_src_attr, name_dst_attr) + value = pipeline_interface.get_attribute( + pipeline_name, name_src_attr) + _LOGGER.log(5, "Assigning '{}': {}".format(name_dst_attr, value)) + setattr(self, name_dst_attr, value) + + # Post-processing of input attribute assignments. + # Ensure that there's a valid all_inputs_attr. + if not self.all_inputs_attr: + self.all_inputs_attr = self.required_inputs_attr + # Convert attribute keys into values. 
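        # For example (hypothetical values): with
        #     self.required_inputs_attr == ["data_source"]
        # the get_attr_values() calls below resolve the named attribute, so
        #     self.required_inputs == ["/path/to/frog_1.fastq"]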
if self.ngs_inputs_attr: - _LOGGER.debug("Handling NGS input attributes: '%s'", self.name) + _LOGGER.log(5, "Handling NGS input attributes: '%s'", self.name) # NGS data inputs exit, so we can add attributes like # read_type, read_length, paired. self.ngs_inputs = self.get_attr_values("ngs_inputs_attr") self.set_read_type(permissive=permissive) else: - _LOGGER.debug("No NGS inputs: '%s'", self.name) + _LOGGER.log(5, "No NGS inputs: '%s'", self.name) + + # Assign values for actual inputs attributes. self.required_inputs = self.get_attr_values("required_inputs_attr") self.all_inputs = self.get_attr_values("all_inputs_attr") - self.input_file_size = get_file_size(self.all_inputs) @@ -2207,13 +2225,12 @@ def get_arg_string(self, pipeline_name, sample, # It's undesirable to put a null value in the argument string. if arg is None: - _LOGGER.debug("Sample is null for attribute: '%s'", value) + _LOGGER.debug("Null value for Sample attribute: '%s'", value) try: arg = proxies[value] except KeyError: - errmsg = "Can't add null Sample attribute to pipeline " \ - "argument string: '{}'".format(value) - raise ValueError(errmsg) + raise ValueError("No default for null " + "Sample attribute: '{}'".format(value)) _LOGGER.debug("Found default for '{}': '{}'". format(value, arg)) From 04036e3c11f2a3d69344197c9be6c0bc2c03df23 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Thu, 22 Jun 2017 17:03:45 -0400 Subject: [PATCH 64/94] MERGE HAPPENS BEFORE COLUMN DERIVATION (via set_file_paths); account for null merge table to facilitate more freedom in how the merge step is called; better messaging about parsing of the annotations file; modularization of test fixtures; tests cleanup; new tests for Project ctor behavior --- looper/models.py | 43 +++++++-- tests/conftest.py | 81 +++++++++++++++-- tests/models/independent/test_Project.py | 108 ++++++++++++++++++++--- tests/test_looper.py | 8 +- 4 files changed, 207 insertions(+), 33 deletions(-) diff --git a/looper/models.py b/looper/models.py index 9f3cfb90..c5612cdd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -157,13 +157,14 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): :param Sample sample: sample to modify via merge table data :param merge_table: data with which to alter Sample :param Mapping data_sources: collection of named paths to data locations - :param derived_columns: names of columns with data-derived value + :param Iterable[str] derived_columns: names of column for which + corresponding Sample attribute's value is data-derived :return Set[str]: names of columns that were merged """ merged_cols = {} - if not merge_table: + if merge_table is None: _LOGGER.debug("No data for sample merge, skipping") return merged_cols @@ -185,8 +186,11 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): _LOGGER.debug("No merge rows for sample '%s', skipping", sample.name) return merged_cols + # Hash derived columns for faster lookup in case of many samples/columns. 
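    # Membership tests like "col in derived_columns" then run in roughly
    # constant time on the set, rather than scanning a list for every column
    # of every merge row.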
+ derived_columns = set(derived_columns) _LOGGER.log(5, "%d rows to merge", len(merge_rows)) + # For each row in the merge table of this sample: # 1) populate any derived columns # 2) derived columns --> space-delimited strings @@ -206,7 +210,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): merged_cols[col_key] = "" row_dict[col_key] = row_dict[col] row_dict[col] = sample.locate_data_source( - data_sources, col, row_dict[col], row_dict) # 1) + data_sources, col, row_dict[col], row_dict) # 1) _LOGGER.log(5, "Adding derived columns") # Also add in any derived cols present. @@ -225,7 +229,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): # Map the column name itself to the # populated data source template string. row_dict[col] = sample.locate_data_source( - data_sources, col, getattr(sample, col), row_dict) + data_sources, col, getattr(sample, col), row_dict) _LOGGER.debug("PROBLEM adding derived column: " "{}, {}, {}".format(col, row_dict[col], getattr(sample, col))) @@ -615,6 +619,9 @@ def __init__(self, config_file, subproject=None, _LOGGER.info("Using subproject: '{}'".format(subproject)) self.parse_config_file(subproject) + # Ensure data_sources is at least set if it wasn't parsed. + self.setdefault("data_sources", None) + self.name = self.infer_name(self.config_file) self.subproject = subproject @@ -641,7 +648,26 @@ def __init__(self, config_file, subproject=None, self.metadata.pipelines_dir)) self.interfaces_by_protocol = \ process_pipeline_interfaces(self.metadata.pipelines_dir) - self.sheet = check_sheet(self.metadata.sample_annotation) + + path_anns_file = self.metadata.sample_annotation + _LOGGER.debug("Reading sample annotations sheet: '%s'", path_anns_file) + try: + self.sheet = check_sheet(path_anns_file) + except IOError: + _LOGGER.error("Alleged annotations file doesn't exist: '%s'", + path_anns_file) + anns_folder_path = _os.path.dirname(path_anns_file) + try: + annotations_file_folder_contents = \ + _os.listdir(anns_folder_path) + except OSError: + _LOGGER.error("Annotations file folder doesn't exist either: " + "'%s'", anns_folder_path) + else: + _LOGGER.error("Annotations file folder's contents: {}". + format(annotations_file_folder_contents)) + raise + self.merge_table = None self._samples = None if defer_sample_construction \ else self._make_basic_samples() @@ -1046,6 +1072,8 @@ def _make_basic_samples(self): sample.set_genome(self.get("genomes")) sample.set_transcriptome(self.get("transcriptomes")) + merge_sample(sample, self.merge_table, + self.data_sources, self.derived_columns) sample.set_file_paths(self) # Hack for backwards-compatibility # Pipelines should now use `data_source`) @@ -1057,8 +1085,6 @@ def _make_basic_samples(self): "data path assignment", sample.sample_name) else: _LOGGER.debug("Path to sample data: '%s'", sample.data_source) - merge_sample(sample, self.merge_table, - self.data_sources, self.derived_columns) samples.append(sample) return samples @@ -1076,6 +1102,9 @@ def parse_config_file(self, subproject=None): with open(self.config_file, 'r') as conf_file: config = yaml.safe_load(conf_file) + _LOGGER.debug("{} config data: {}".format( + self.__class__.__name__, config)) + # Parse yaml into the project's attributes. 
_LOGGER.debug("Adding attributes for {}: {}".format( self.__class__.__name__, config.keys())) diff --git a/tests/conftest.py b/tests/conftest.py index b05c4bdf..cde80f02 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,7 @@ from pandas.io.parsers import EmptyDataError import pytest +import yaml from looper import setup_looper_logger from looper.models import PipelineInterface @@ -26,6 +27,8 @@ _LOGGER = logging.getLogger("looper") +P_CONFIG_FILENAME = "project_config.yaml" + # {basedir} lines are formatted during file write; other braced entries remain. PROJECT_CONFIG_LINES = """metadata: sample_annotation: samples.csv @@ -194,6 +197,12 @@ def pytest_generate_tests(metafunc): +@pytest.fixture(scope="function") +def sample_annotation_lines(): + return SAMPLE_ANNOTATION_LINES + + + @pytest.fixture(scope="session", autouse=True) def conf_logs(request): """ Configure logging for the testing session. """ @@ -244,7 +253,7 @@ def interactive(prj_lines=PROJECT_CONFIG_LINES, dirpath = tempfile.mkdtemp() path_conf_file = _write_temp( prj_lines, - dirpath=dirpath, fname="project_config.yaml") + dirpath=dirpath, fname=P_CONFIG_FILENAME) path_iface_file = _write_temp( iface_lines, dirpath=dirpath, fname="pipeline_interface.yaml") @@ -297,8 +306,8 @@ def _write_temp(lines, dirpath, fname): **{"derived_column_names": ", ".join(DERIVED_COLNAMES)} ) filepath = os.path.join(dirpath, fname) - _LOGGER.debug("Writing %d lines to file '%s'", len(lines), filepath) data_source_formatter = string.Formatter() + num_lines = 0 with open(filepath, 'w') as tmpf: for l in lines: if "{basedir}" in l: @@ -308,7 +317,67 @@ def _write_temp(lines, dirpath, fname): l = data_source_formatter.vformat( l, (), derived_columns_replacement) tmpf.write(l) - return tmpf.name + num_lines += 1 + _LOGGER.debug("Wrote %d line(s) to disk: '%s'", num_lines, filepath) + return filepath + + + +@pytest.fixture(scope="function") +def project_config_lines(): + """ Provide safer iteration over the lines for Project config file. """ + return PROJECT_CONFIG_LINES + + + +@pytest.fixture(scope="function") +def path_project_conf(tmpdir, project_config_lines): + """ + Write the Project configuration data. + + :param py.path.local.LocalPath tmpdir: temporary Path fixture + :param Iterable[str] project_config_lines: collection of lines for + Project configuration file + :return str: path to file with Project configuration data + """ + return _write_temp( + project_config_lines, tmpdir.strpath, P_CONFIG_FILENAME) + + + +@pytest.fixture(scope="function") +def proj_conf_data(path_project_conf): + """ + Read and parse raw Project configuration data. + + :param str path_project_conf: path to file with Project configuration data + :return Mapping: the data parsed from the configuration file written, + a Mapping form of the raw Project config text lines + """ + with open(path_project_conf, 'r') as conf_file: + return yaml.safe_load(conf_file) + + + +@pytest.fixture(scope="function") +def path_sample_anns(tmpdir, sample_annotation_lines): + """ + Write the sample annotations file and return the path to it. 
+ + :param py.path.local.LocalPath tmpdir: temporary Path fixture + :param Iterable[str] sample_annotation_lines: collection of lines for + the sample annotations files + :return str: path to the sample annotations file that was written + """ + filepath = _write_temp( + sample_annotation_lines, tmpdir.strpath, ANNOTATIONS_FILENAME) + return filepath + + + +@pytest.fixture(scope="function") +def p_conf_fname(): + return P_CONFIG_FILENAME @@ -323,7 +392,7 @@ def write_project_files(request): """ dirpath = tempfile.mkdtemp() path_conf_file = _write_temp(PROJECT_CONFIG_LINES, - dirpath=dirpath, fname="project_config.yaml") + dirpath=dirpath, fname=P_CONFIG_FILENAME) path_merge_table_file = _write_temp( MERGE_TABLE_LINES, dirpath=dirpath, fname=MERGE_TABLE_FILENAME @@ -400,7 +469,7 @@ def request_class_attribute(req, attr): -def _create(request, data_type): +def _create(request, data_type, **kwargs): """ Create instance of desired type, using file in request class. @@ -413,7 +482,7 @@ def _create(request, data_type): _LOGGER.debug("Using %s as source of data to build %s", data_source, data_type.__class__.__name__) try: - return data_type(data_source) + return data_type(data_source, **kwargs) except EmptyDataError: with open(data_source, 'r') as datafile: _LOGGER.error("File contents:\n{}".format(datafile.readlines())) diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index 293f7086..d079786e 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -8,7 +8,7 @@ import yaml import looper from looper.models import \ - AttributeDict, Project, \ + AttributeDict, Project, Sample, \ _MissingMetadataException, SAMPLE_ANNOTATIONS_KEY @@ -42,6 +42,98 @@ def pytest_generate_tests(metafunc): +class ProjectConstructorTests: + """ Tests of Project constructor, particularly behavioral details. """ + + + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["as_null", "missing"], + ids=lambda spec: "spec_type={}".format(spec)) + @pytest.mark.parametrize( + argnames="lazy", argvalues=[False, True], + ids=lambda lazy: "lazy={}".format(lazy)) + def test_no_merge_table_in_config( + self, tmpdir, spec_type, lazy, + proj_conf_data, path_sample_anns): + """ Merge table attribute remains null if config lacks merge_table. """ + metadata = proj_conf_data["metadata"] + try: + assert "merge_table" in metadata + except AssertionError: + print("Project metadata section lacks 'merge_table'") + print("All config data: {}".format(proj_conf_data)) + print("Config metadata section: {}".format(metadata)) + raise + if spec_type == "as_null": + metadata["merge_table"] = None + elif spec_type == "missing": + del metadata["merge_table"] + else: + raise ValueError("Unknown way to specify no merge table: {}". + format(spec_type)) + path_config_file = os.path.join(tmpdir.strpath, "project_config.yaml") + with open(path_config_file, 'w') as conf_file: + yaml.safe_dump(proj_conf_data, conf_file) + p = Project(path_config_file, defer_sample_construction=lazy) + assert p.merge_table is None + + + @pytest.mark.skip("Not implemented") + def test_merge_table_construction( + self, tmpdir, project_config_data): + """ Merge table is constructed iff samples are constructed. """ + # TODO: implement + pass + + + def test_counting_samples_doesnt_create_samples( + self, sample_annotation_lines, + path_project_conf, path_sample_anns): + """ User can ask about sample count without creating samples. 
""" + # We're not parameterized in terms of Sample creation laziness here + # because a piece of the test's essence is Sample collection absence. + p = Project(path_project_conf, defer_sample_construction=True) + assert p._samples is None + expected_sample_count = sum(1 for _ in sample_annotation_lines) - 1 + assert expected_sample_count == p.num_samples + assert p._samples is None + + + @pytest.mark.parametrize(argnames="lazy", argvalues=[False, True]) + def test_sample_creation_laziness( + self, path_project_conf, path_sample_anns, lazy): + """ Project offers control over whether to create base Sample(s). """ + + p = Project(path_project_conf, defer_sample_construction=lazy) + + if lazy: + # Samples should remain null during lazy Project construction. + assert p._samples is None + + else: + # Eager Project construction builds Sample objects. + assert p._samples is not None + with open(path_sample_anns, 'r') as anns_file: + anns_file_lines = anns_file.readlines() + + # Sum excludes the header line. + num_samples_expected = sum(1 for l in anns_file_lines[1:] if l) + assert num_samples_expected == len(p._samples) + assert all([Sample == type(s) for s in p._samples]) + + + @pytest.mark.parametrize(argnames="lazy", argvalues=[False, True]) + def test_sample_name_availability( + self, path_project_conf, path_sample_anns, lazy): + """ Sample names always available on Project. """ + with open(path_sample_anns, 'r') as anns_file: + expected_sample_names = \ + [l.split(",")[0] for l in anns_file.readlines()[1:] if l] + p = Project(path_project_conf, defer_sample_construction=lazy) + assert expected_sample_names == list(p.sample_names) + + + class ProjectRequirementsTests: """ Tests for a Project's set of requirements. """ @@ -574,8 +666,6 @@ def _env_paths_to_names(envs): """ reduced = {} for env_name, env_data in envs.items(): - # DEBUG - print(env_name) reduced[env_name] = _compute_paths_to_names(env_data) return reduced @@ -594,15 +684,5 @@ def _compute_paths_to_names(env): """ reduced = copy.deepcopy(env) for pathvar in ["submission_template"]: - - # DEBUG - try: - _, reduced[pathvar] = os.path.split(reduced[pathvar]) - except KeyError: - print("REDUCED: {}".format(reduced)) - print("ENV: {}".format(env)) - print("KEYS: {}".format(reduced.keys())) - print("ENV KEYS: {}".format(env.keys())) - raise - + _, reduced[pathvar] = os.path.split(reduced[pathvar]) return reduced diff --git a/tests/test_looper.py b/tests/test_looper.py index 5b11d001..e3e55ccb 100644 --- a/tests/test_looper.py +++ b/tests/test_looper.py @@ -14,13 +14,12 @@ import numpy.random as nprand import pytest -import yaml from looper.looper import aggregate_exec_skip_reasons import looper.models -from looper.models import AttributeDict, ATTRDICT_METADATA, COL_KEY_SUFFIX +from looper.models import COL_KEY_SUFFIX from .conftest import \ - DERIVED_COLNAMES, EXPECTED_MERGED_SAMPLE_FILES, FILE_BY_SAMPLE, \ + DERIVED_COLNAMES, EXPECTED_MERGED_SAMPLE_FILES, \ LOOPER_ARGS_BY_PIPELINE, MERGED_SAMPLE_INDICES, NGS_SAMPLE_INDICES, \ NUM_SAMPLES, PIPELINE_TO_REQD_INFILES_BY_SAMPLE @@ -32,9 +31,6 @@ @pytest.mark.usefixtures("write_project_files") class ProjectConstructorTest: - # TODO: docstrings and atomicity/encapsulation. - # TODO: conversion to pytest for consistency. 
- @pytest.mark.parametrize(argnames="attr_name", argvalues=["required_inputs", "all_input_attr"]) From 73038e95365fbfad4af23a1b5fa7c7de9f89287b Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 23 Jun 2017 12:43:56 -0400 Subject: [PATCH 65/94] handle Sample subtype resolution and pipeline module imports; first-pass at path expansion for pipeline interface key; groundwork for raw mapping to ProtocolInterface --- looper/models.py | 179 +++++++++++++++++++++++++++++++---------------- looper/utils.py | 11 +++ 2 files changed, 131 insertions(+), 59 deletions(-) diff --git a/looper/models.py b/looper/models.py index c5612cdd..6ec922d5 100644 --- a/looper/models.py +++ b/looper/models.py @@ -66,8 +66,8 @@ import yaml from .utils import \ - alpha_cased, check_bam, check_fastq, get_file_size, \ - import_from_source, parse_ftype, partition + alpha_cased, check_bam, check_fastq, expandpath, \ + get_file_size, import_from_source, parse_ftype, partition COMPUTE_SETTINGS_VARNAME = "PEPENV" @@ -271,8 +271,8 @@ def process_pipeline_interfaces(pipeline_interface_locations): pipeline interface and protocol mappings information. Each such file should be have a pipelines section and a protocol mappings section whereas each folder should have a file for each of those sections. - :return Mapping[str, ProtocolInterface]: mapping from protocol name to - interface(s) for which that protocol is mapped + :return Mapping[str, Iterable[ProtocolInterface]]: mapping from protocol + name to interface(s) for which that protocol is mapped """ interface_by_protocol = defaultdict(list) for pipe_iface_location in pipeline_interface_locations: @@ -860,8 +860,8 @@ def build_submission_bundles(self, protocol, priority=True): this_protocol_pipelines = proto_iface.fetch(protocol) if not this_protocol_pipelines: - _LOGGER.warn("No mapping for protocol '%s' in '%s', skipping", - protocol, proto_iface.location) + _LOGGER.warn("No mapping for protocol '%s' in %s", + protocol, proto_iface) continue # TODO: update once dependency-encoding logic is in place. @@ -904,7 +904,7 @@ def build_submission_bundles(self, protocol, priority=True): _LOGGER.debug("{} new scripts for protocol {} from " "pipeline(s) location '{}': {}". format(len(new_scripts), protocol, - proto_iface.location, new_scripts)) + proto_iface.source, new_scripts)) new_jobs = [proto_iface.create_submission_bundle(pipeline_key, protocol) @@ -2295,10 +2295,10 @@ def get_arg_string(self, pipeline_name, sample, def get_attribute(self, pipeline_name, attribute_key, path_as_list=True): """ - Return value of given attribute for named pipeline. + Return the value of the named attribute for the pipeline indicated. :param str pipeline_name: name of the pipeline of interest - :param str attribute_key: name of the attribute of interest + :param str attribute_key: name of the pipeline attribute of interest :param bool path_as_list: whether to ensure that a string attribute is returned as a list; this is useful for safe iteration over the returned value. @@ -2370,58 +2370,62 @@ class ProtocolInterface(object): single project. Also stored are path attributes with information about the location(s) from which the PipelineInterface and ProtocolMapper came. 
- :param location: location (e.g., code repository) of pipelines - :type location: str + :param interface_data_source: location (e.g., code repository) of pipelines + :type interface_data_source: str """ SUBTYPE_MAPPING_SECTION = "sample_subtypes" - def __init__(self, location): - + def __init__(self, interface_data_source): super(ProtocolInterface, self).__init__() - if _os.path.isdir(location): - self.location = location - self.pipe_iface_path = _os.path.join( - location, "config", "pipeline_interface.yaml") - self.pipe_iface = PipelineInterface(self.pipe_iface_path) - self.protomap = ProtocolMapper(_os.path.join( - location, "config", "protocol_mappings.yaml")) - self.pipelines_path = _os.path.join(location, "pipelines") + if isinstance(interface_data_source, Mapping): + # TODO: for implementation, we need to determine pipelines_path. + raise NotImplementedError( + "Raw Mapping as source of {} data is not yet supported". + format(self.__class__.__name__)) + _LOGGER.debug("Creating %s from raw Mapping", + self.__class__.__name__) + self.source = None + self.pipe_iface_path = None + for name, value in self._parse_iface_data(interface_data_source): + setattr(self, name, value) - elif _os.path.isfile(location): + elif _os.path.isfile(interface_data_source): # Secondary version that passes combined yaml file directly, - # instead of relying on separate hard-coded config names as above - self.location = None - self.pipe_iface_path = location - self.pipelines_path = _os.path.dirname(location) - - with open(location, 'r') as interface_file: + # instead of relying on separate hard-coded config names. + _LOGGER.debug("Creating %s from file: '%s'", + self.__class__.__name__, interface_data_source) + self.source = interface_data_source + self.pipe_iface_path = self.source + self.pipelines_path = _os.path.dirname(self.source) + + with open(interface_data_source, 'r') as interface_file: iface = yaml.load(interface_file) - try: - if "protocol_mapping" in iface: - self.protomap = ProtocolMapper(iface["protocol_mapping"]) - else: - raise Exception("pipeline_interface file is missing " - "a 'protocol_mapping' section.") - if "pipelines" in iface: - self.pipe_iface = PipelineInterface(iface["pipelines"]) - else: - raise Exception("pipeline_interface file is missing " - "a 'pipelines' section.") - except Exception as e: - _LOGGER.error(str(iface)) - raise e + for name, value in self._parse_iface_data(iface): + setattr(self, name, value) + + elif _os.path.isdir(interface_data_source): + _LOGGER.debug("Creating %s from files in directory: '%s'", + self.__class__.__name__, interface_data_source) + self.source = interface_data_source + self.pipe_iface_path = _os.path.join( + self.source, "config", "pipeline_interface.yaml") + self.pipelines_path = _os.path.join(self.source, "pipelines") + + self.pipe_iface = PipelineInterface(self.pipe_iface_path) + self.protomap = ProtocolMapper(_os.path.join( + self.source, "config", "protocol_mappings.yaml")) else: raise ValueError("Alleged pipelines location '{}' exists neither " - "as a file nor as a folder.".format(location)) + "as a file nor as a folder.".format(interface_data_source)) def __repr__(self): - return "ProtocolInterface from '{}'".format(self.location) + return "ProtocolInterface from '{}'".format(self.source or "Mapping") def create_submission_bundle(self, pipeline_key, protocol): @@ -2448,15 +2452,24 @@ def create_submission_bundle(self, pipeline_key, protocol): except KeyError: _LOGGER.debug("%s from '%s' doesn't define section '%s' " "for pipeline 
'%s'", - self.pipe_iface.__class__.__name__, self.location, + self.pipe_iface.__class__.__name__, self.source, self.SUBTYPE_MAPPING_SECTION, strict_pipe_key) - subtype = Sample + # Without a subtypes section, if pipeline module defines a single + # Sample subtype, we'll assume that type is to be used when in + # this case, when the interface section for this pipeline lacks + # an explicit subtypes section specification. + subtype_name = None else: - if isinstance(subtypes, str): + if subtypes is None: + _LOGGER.debug("Null Sample subtypes specified for pipeline: " + "'%s'; using base Sample type", strict_pipe_key) + # Designate lack of need to attempt pipeline module import. + subtype = Sample + elif isinstance(subtypes, str): subtype_name = subtypes _LOGGER.debug("Single subtype name for pipeline '%s' " "in interface from '%s': '%s'", subtype_name, - strict_pipe_key, self.location) + strict_pipe_key, self.source) else: try: temp_subtypes = {alpha_cased(p): st @@ -2466,7 +2479,7 @@ def create_submission_bundle(self, pipeline_key, protocol): subtype = Sample _LOGGER.debug("No %s subtype specified in interface from " "'%s': '%s', '%s'; known: %s", - subtype.__name__, self.location, + subtype.__name__, self.source, strict_pipe_key, protocol, ", ".join(temp_subtypes.keys())) @@ -2489,6 +2502,22 @@ def fetch(self, protocol): return self.protomap.mappings.get(alpha_cased(protocol)) + @classmethod + def _parse_iface_data(cls, pipe_iface_data): + assignments = [("protocol_mapping", ProtocolMapper, "protomap"), + ("pipelines", PipelineInterface, "pipe_iface")] + attribute_values = [] + for section_name, data_type, attr_name in assignments: + try: + data = pipe_iface_data[section_name] + except KeyError: + _LOGGER.error("Error creating %s from data: %s", + cls.__name__, str(pipe_iface_data)) + raise Exception("PipelineInterface file lacks section: '{}'". + format(section_name)) + attribute_values.append((attr_name, data_type(data))) + return attribute_values + def pipeline_key_to_path(self, pipeline_key): """ @@ -2509,6 +2538,9 @@ def pipeline_key_to_path(self, pipeline_key): if self.pipe_iface.get_attribute(strict_pipeline_key, "path"): script_path_only = self.pipe_iface.get_attribute( strict_pipeline_key, "path")[0].strip() + _LOGGER.log(5, "Expanding path: '%s'", script_path_only) + script_path_only = expandpath(script_path_only) + _LOGGER.log(5, "Expanded: '%s'", script_path_only) script_path_with_flags = \ " ".join([script_path_only, pipeline_key_args]) else: @@ -2669,18 +2701,21 @@ def __init__(self, subtype_name, pipeline_filepath): -def _import_sample_subtype(pipeline_filepath, subtype_name): +def _import_sample_subtype(pipeline_filepath, subtype_name=None): """ Import a particular Sample subclass from a Python module. :param str pipeline_filepath: path to file to regard as Python module - :param str subtype_name: name of the target class; this must derive from - the base Sample class. + :param str subtype_name: name of the target class (which must derive from + the base Sample class in order for it to be used), optional; if + unspecified, if the module defines a single subtype, then that will + be used; otherwise, the base Sample type will be used. 
:return type: the imported class, defaulting to base Sample in case of failure with the import or other logic :raises _UndefinedSampleSubtypeException: if the module is imported but type indicated by subtype_name is not found as a class """ + from argparse import ArgumentError base_type = Sample _, modname = _os.path.split(pipeline_filepath) @@ -2691,6 +2726,11 @@ def _import_sample_subtype(pipeline_filepath, subtype_name): "calling it {}".format(pipeline_filepath, modname)) pipeline_module = import_from_source( name=modname, module_filepath=pipeline_filepath) + except (ArgumentError, SystemExit): + _LOGGER.warn("'%s' appears to attempt to run on import; " + "does it lack a conditional on __main__? Using base %s", + base_type.__name__) + return base_type except ImportError as e: _LOGGER.warn("Using base %s because of failure in attempt to " "import pipeline module: %s", base_type.__name__, e) @@ -2704,6 +2744,7 @@ def _import_sample_subtype(pipeline_filepath, subtype_name): def class_names(cs): return ", ".join([c.__name__ for c in cs]) + # Find classes from pipeline module and determine which derive from Sample. classes = inspect.getmembers( pipeline_module, lambda obj: inspect.isclass(obj)) classes = [klazz for _, klazz in classes] @@ -2712,13 +2753,33 @@ def class_names(cs): _LOGGER.debug("%d %s subtype(s): %s", len(sample_subtypes), base_type.__name__, class_names(sample_subtypes)) - for st in sample_subtypes: - if st.__name__ == subtype_name: - _LOGGER.debug("Successfully imported %s from '%s'", - subtype_name, pipeline_filepath) - return st - raise _UndefinedSampleSubtypeException( - subtype_name=subtype_name, pipeline_filepath=pipeline_filepath) + # Determine course of action based on subtype request and number found. + if not subtype_name: + _LOGGER.debug("No specific subtype is requested from '%s'", + pipeline_filepath) + if len(sample_subtypes) == 1: + # No specific request and single subtype --> use single subtype. + subtype = sample_subtypes[0] + _LOGGER.debug("Single %s subtype found in '%s': '%s'", + base_type.__name__, pipeline_filepath, + subtype.__name__) + return subtype + else: + # We can't arbitrarily select from among 0 or multiple subtypes. + _LOGGER.debug("%s subtype cannot be selected from %d in '%s'; " + "using base type", base_type.__name__, + len(sample_subtypes), pipeline_filepath) + return base_type + else: + # Specific subtype request --> look for match. + for st in sample_subtypes: + if st.__name__ == subtype_name: + _LOGGER.debug("Successfully imported %s from '%s'", + subtype_name, pipeline_filepath) + return st + _LOGGER.warn("No subtype from '%s' matches '%s'; using base: %s", + pipeline_filepath, subtype_name, base_type.__name__) + return base_type diff --git a/looper/utils.py b/looper/utils.py index ea47a36e..720833d6 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -73,6 +73,17 @@ def check_fastq(fastq, o): +def expandpath(path): + """ + Expand a filesystem path that may or may not contain user/env vars. + + :param str path: path to expand + :return str: expanded version of input path + """ + return os.path.expandvars(os.path.expanduser(path)).replace("//", "/") + + + def fetch_package_classes(pkg, predicate=None): """ Enable single-depth fetch of package's classes if not exported. 
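
As a quick illustration of the expandpath helper added to looper/utils.py in this
patch (a minimal sketch under assumed values, not part of the patch itself): it
expands a leading "~" and any environment variables, while leaving a relative
path relative.

    # Hypothetical usage of looper.utils.expandpath. The expected outputs in the
    # comments assume HOME=/home/user and an exported CODEBASE=/home/user/code.
    from looper.utils import expandpath

    print(expandpath("~/pipelines"))           # /home/user/pipelines
    print(expandpath("$CODEBASE/pipelines"))   # /home/user/code/pipelines
    print(expandpath("pipelines/wgbs.py"))     # pipelines/wgbs.py (unchanged)

Keeping an already-relative path relative matters for the pipeline interface
"path" key handled above: joining against the pipelines location is deferred
until the script path is actually used.
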
From bae47dfc486f96395108f0e21e178defa80be325 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 23 Jun 2017 13:11:06 -0400 Subject: [PATCH 66/94] fix import handling and improve exception catching; microtest working on 2.7, 3.4, 3.5 --- looper/models.py | 5 ++--- looper/utils.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/looper/models.py b/looper/models.py index 6ec922d5..d454adcd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2715,7 +2715,6 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): :raises _UndefinedSampleSubtypeException: if the module is imported but type indicated by subtype_name is not found as a class """ - from argparse import ArgumentError base_type = Sample _, modname = _os.path.split(pipeline_filepath) @@ -2726,12 +2725,12 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): "calling it {}".format(pipeline_filepath, modname)) pipeline_module = import_from_source( name=modname, module_filepath=pipeline_filepath) - except (ArgumentError, SystemExit): + except SystemExit: _LOGGER.warn("'%s' appears to attempt to run on import; " "does it lack a conditional on __main__? Using base %s", base_type.__name__) return base_type - except ImportError as e: + except Exception as e: _LOGGER.warn("Using base %s because of failure in attempt to " "import pipeline module: %s", base_type.__name__, e) return base_type diff --git a/looper/utils.py b/looper/utils.py index 720833d6..dc17b6a0 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -147,7 +147,7 @@ def import_from_source(name, module_filepath): if sys.version_info >= (3, 5): from importlib import util as _il_util - modspec = _il_util.spec_from_file_module_filepath( + modspec = _il_util.spec_from_file_location( name, module_filepath) mod = _il_util.module_from_spec(modspec) modspec.loader.exec_module(mod) From 5588b0eebbb279b0691ee443367f9a951791e7c3 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 23 Jun 2017 13:55:06 -0400 Subject: [PATCH 67/94] Fix doc bugs --- doc/source/changelog.rst | 3 +-- doc/source/conf.py | 1 + doc/source/config-files.rst | 2 +- doc/source/define-your-project.rst | 4 ++-- doc/source/inputs.rst | 3 --- ...terface-mapping.rst => pipeline-interface-mapping.rst.inc} | 4 +++- ...ace-pipelines.rst => pipeline-interface-pipelines.rst.inc} | 2 ++ doc/source/pipeline-interface.rst | 4 ++-- doc/source/{project-config.rst => project-config.rst.inc} | 2 ++ ...e-annotation-sheet.rst => sample-annotation-sheet.rst.inc} | 1 + doc/source/tutorials.rst | 2 +- 11 files changed, 16 insertions(+), 12 deletions(-) delete mode 100644 doc/source/inputs.rst rename doc/source/{pipeline-interface-mapping.rst => pipeline-interface-mapping.rst.inc} (98%) rename doc/source/{pipeline-interface-pipelines.rst => pipeline-interface-pipelines.rst.inc} (99%) rename doc/source/{project-config.rst => project-config.rst.inc} (99%) rename doc/source/{sample-annotation-sheet.rst => sample-annotation-sheet.rst.inc} (99%) diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index bc80e555..d44606ca 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -27,8 +27,7 @@ Changelog - Various small bug fixes and dev improvements. - - Require `setuptools` for installation, and `pandas 0.20.2`. If `numexpr` is installed, - version `2.6.2` is required. + - Require `setuptools` for installation, and `pandas 0.20.2`. If `numexpr` is installed, version `2.6.2` is required. 
- **v0.5** (*2017-03-01*): diff --git a/doc/source/conf.py b/doc/source/conf.py index b7f13245..2037083e 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -140,6 +140,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_static_path = [] # it's empty; suppress warning # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/doc/source/config-files.rst b/doc/source/config-files.rst index 95264447..4c5d2d08 100644 --- a/doc/source/config-files.rst +++ b/doc/source/config-files.rst @@ -19,7 +19,7 @@ If you are planning to submit jobs to a cluster, then you need to know about a s That should be all you need to worry about as a pipeline user. If you need to adjust compute resources or want to develop a pipeline or have more advanced project-level control over pipelines, then you'll need to know about a few others: Pipeline developers -***************** +********************** If you want to add a new pipeline to looper, tweak the way looper interacts with a pipeline for a given project, or change the default cluster resources requested by a pipeline, then you need to know about a configuration file that coordinates linking your pipeline in to your looper project. diff --git a/doc/source/define-your-project.rst b/doc/source/define-your-project.rst index af9d543d..4a6e884e 100644 --- a/doc/source/define-your-project.rst +++ b/doc/source/define-your-project.rst @@ -40,8 +40,8 @@ For example, by default, your jobs will run serially on your local computer, whe Let's go through the more advanced details of both annotation sheets and project config files: -.. include:: sample-annotation-sheet.rst +.. include:: sample-annotation-sheet.rst.inc -.. include:: project-config.rst +.. include:: project-config.rst.inc diff --git a/doc/source/inputs.rst b/doc/source/inputs.rst deleted file mode 100644 index 07c26983..00000000 --- a/doc/source/inputs.rst +++ /dev/null @@ -1,3 +0,0 @@ -Required Inputs -============================================= - diff --git a/doc/source/pipeline-interface-mapping.rst b/doc/source/pipeline-interface-mapping.rst.inc similarity index 98% rename from doc/source/pipeline-interface-mapping.rst rename to doc/source/pipeline-interface-mapping.rst.inc index 114fd199..61a2be74 100644 --- a/doc/source/pipeline-interface-mapping.rst +++ b/doc/source/pipeline-interface-mapping.rst.inc @@ -1,4 +1,6 @@ -.. _pipeline-interface-mapping: +:orphan: + +.. _pi_mapping: Pipeline interface section: protocol_mapping ******************************************** diff --git a/doc/source/pipeline-interface-pipelines.rst b/doc/source/pipeline-interface-pipelines.rst.inc similarity index 99% rename from doc/source/pipeline-interface-pipelines.rst rename to doc/source/pipeline-interface-pipelines.rst.inc index 4cceeb3f..0e23cc7b 100644 --- a/doc/source/pipeline-interface-pipelines.rst +++ b/doc/source/pipeline-interface-pipelines.rst.inc @@ -1,3 +1,5 @@ +:orphan: + .. _pipeline-interface-pipelines: Pipeline interface section: pipelines diff --git a/doc/source/pipeline-interface.rst b/doc/source/pipeline-interface.rst index f7bae756..0a7888ef 100644 --- a/doc/source/pipeline-interface.rst +++ b/doc/source/pipeline-interface.rst @@ -31,7 +31,7 @@ Let's start with a very simple example. 
A basic ``pipeline_interface.yaml`` file The first section specifies that samples of protocol ``RRBS`` will be mapped to the pipeline specified by key ``rrbs_pipeline``. The second section describes where the pipeline named ``rrbs_pipeline`` is located and what command-line arguments it requires. Pretty simple. Let's go through each of these sections in more detail: -.. include:: pipeline-interface-mapping.rst +.. include:: pipeline-interface-mapping.rst.inc -.. include:: pipeline-interface-pipelines.rst +.. include:: pipeline-interface-pipelines.rst.inc diff --git a/doc/source/project-config.rst b/doc/source/project-config.rst.inc similarity index 99% rename from doc/source/project-config.rst rename to doc/source/project-config.rst.inc index 10695637..62e76b15 100644 --- a/doc/source/project-config.rst +++ b/doc/source/project-config.rst.inc @@ -1,3 +1,5 @@ +:orphan: + Project config file *************************************************** diff --git a/doc/source/sample-annotation-sheet.rst b/doc/source/sample-annotation-sheet.rst.inc similarity index 99% rename from doc/source/sample-annotation-sheet.rst rename to doc/source/sample-annotation-sheet.rst.inc index f6ea5c5d..a3464595 100644 --- a/doc/source/sample-annotation-sheet.rst +++ b/doc/source/sample-annotation-sheet.rst.inc @@ -1,3 +1,4 @@ +:orphan: Sample annotation sheet ************************************************** diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 425e1644..0b918829 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -41,7 +41,7 @@ Inside there will be two directories: - ``submissions`` [2]_ - which holds yaml representations of the samples and log files of the submited jobs. -The sample-specific output of each pipeline type varies and is described in :doc:`pipelines`. +The sample-specific output of each pipeline type varies. To use pre-made pipelines with your project, all you have to do is :doc:`define your project ` using looper's standard format. To link your own, custom built pipelines, you can :doc:`connect your pipeline to looper with a pipeline interface `. From b190e0202154876b3e9fb1b98f0d32e18f0da6b7 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 23 Jun 2017 13:57:15 -0400 Subject: [PATCH 68/94] Add examples folder back in --- examples/microtest_merge_table.csv | 4 + examples/microtest_project_config.yaml | 94 ++++++++++++++++++++++++ examples/microtest_sample_annotation.csv | 16 ++++ 3 files changed, 114 insertions(+) create mode 100644 examples/microtest_merge_table.csv create mode 100644 examples/microtest_project_config.yaml create mode 100644 examples/microtest_sample_annotation.csv diff --git a/examples/microtest_merge_table.csv b/examples/microtest_merge_table.csv new file mode 100644 index 00000000..2bcf237f --- /dev/null +++ b/examples/microtest_merge_table.csv @@ -0,0 +1,4 @@ +sample_name,data_source,file_number +rrbs,microtest_merge,1 +wgbs,microtest_merge,1 +wgbs,microtest_merge,2 diff --git a/examples/microtest_project_config.yaml b/examples/microtest_project_config.yaml new file mode 100644 index 00000000..1109d790 --- /dev/null +++ b/examples/microtest_project_config.yaml @@ -0,0 +1,94 @@ +# This project config file describes all *project-specific variables* +# Its primary purpose as as input to Looper, which will submit jobs as appropriate +# for each sample in the project. 
+# But it is also read by other tools, including: +# - project sample loop (primary purpose) +# - make_trackhubs scripts to produce web accessible results +# - stats summary scripts +# - analysis scripts requiring pointers to metadata, results, and other options. + +metadata: + # output_dir: ABSOLUTE PATH to the parent, shared space where project results go + output_dir: /scratch/lab_bock/shared/projects/microtest + # results and submission subdirs are subdirectors directories under parent output_dir + # results: where output sample folders will go + # submission: where cluster submit scripts and log files will go + results_subdir: results_pipeline + submission_subdir: submission + # pipelines_dir: ABSOLUTE PATH the directory where the Looper will find pipeline + # scripts (and accompanying pipeline config files) for submission. + pipelines_dir: $CODEBASE/pipelines + # Elements in this section can be absolute or relative. + # Typically, this project config file is stored with the project metadata, so + # relative paths are considered relative to this project config file. + # sample_annotation: one-row-per-sample metadata + sample_annotation: microtest_sample_annotation.csv + # merge_table: input for samples with more than one input file + merge_table: microtest_merge_table.csv + # compare_table: comparison pairs or groups, like normalization samples + compare_table: null.csv + + +# a list of annotation sheet columns that are "derived" +# the values in these are constructed using a regex-like expression +# of variables (defined in the next section). +derived_columns: [data_source] + + +data_sources: + # specify the ABSOLUTE PATH of input files using variable path expressions + # entries correspond to values in the data_source column in sample_annotation table + # {variable} can be used to replace environment variables or other sample_annotation columns + # If you use {variable} codes, you should quote the field so python can parse it. + bsf_samples: "{RAWDATA}{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam" + microtest: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}.bam" + microtest_merge: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}{file_number}.bam" + + +subprojects: + config_test: + pipeline_config: + wgbs.py: wgbs_ds.yaml + + +genomes: + human: hg19 + mouse: mm10 + +transcriptomes: + human: hg19_cdna + mouse: mm10_cdna + + +pipeline_config: + # pipeline configuration files used in project. + # Key string must match the _name of the pipeline script_ (including extension) + # Relative paths are relative to this project config file. + # Default (null) means use the generic config for the pipeline. + # wgbs.py: null + # Or you can point to a specific config to be used in this project: + # rrbs.py: rrbs_config.yaml + # wgbs.py: wgbs_config.yaml + # cgps: cpgs_config.yaml + + +pipeline_args: + rnaBitSeq.py: + "-w": 50 + + +trackhubs: + trackhub_dir: /data/groups/lab_bock/public_html/arendeiro/microtest/ + # url: if you include this, the make_trackhubs will produce a link to your track hub in the project folder. 
+ url: http://www.whatever.com/ + matrix_x: cell_type + matrix_y: cell_count + sort_order: cell_type=+ + parent_track_name: ews_rrbs + visibility: dense + hub_name: ews_hub + short_label_column: sample_name + email: arendeiro@cemm.oeaw.ac.at + +username: user +email: user@email.com diff --git a/examples/microtest_sample_annotation.csv b/examples/microtest_sample_annotation.csv new file mode 100644 index 00000000..bc9b1b49 --- /dev/null +++ b/examples/microtest_sample_annotation.csv @@ -0,0 +1,16 @@ +sample_name,library,organism,ip,data_source +atac-seq_PE,ATAC-seq,human,,microtest +atac-seq_SE,ATAC-seq,human,,microtest +chip-seq_PE,CHIP-seq,human,H3K27ac,microtest +chip-seq_SE,CHIP-seq,human,H3K27ac,microtest +chipmentation_PE,ChIPmentation,human,H3K27ac,microtest +chipmentation_SE,ChIPmentation,human,H3K27ac,microtest +cpgseq_example_data,CpG-seq,human,,microtest +quant-seq_SE,Quant-seq,human,,microtest +rrbs,RRBS,human,,microtest +rrbs_PE,RRBS,human,,microtest +wgbs,WGBS,human,,microtest +RNA_TRUseq_50SE,SMART,human,,microtest +RNA_SMART_50SE,SMART,human,,microtest +rrbs_PE_fq,RRBS,human,,microtest +rrbs_fq,RRBS,human,,microtest From 53f286846c2c17256100e5837a94cca6eb6c71ea Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 23 Jun 2017 14:25:34 -0400 Subject: [PATCH 69/94] update examples to implied cols --- examples/microtest_project_config.yaml | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/examples/microtest_project_config.yaml b/examples/microtest_project_config.yaml index 1109d790..c603fbe6 100644 --- a/examples/microtest_project_config.yaml +++ b/examples/microtest_project_config.yaml @@ -1,11 +1,4 @@ # This project config file describes all *project-specific variables* -# Its primary purpose as as input to Looper, which will submit jobs as appropriate -# for each sample in the project. -# But it is also read by other tools, including: -# - project sample loop (primary purpose) -# - make_trackhubs scripts to produce web accessible results -# - stats summary scripts -# - analysis scripts requiring pointers to metadata, results, and other options. metadata: # output_dir: ABSOLUTE PATH to the parent, shared space where project results go @@ -50,15 +43,14 @@ subprojects: pipeline_config: wgbs.py: wgbs_ds.yaml - -genomes: - human: hg19 - mouse: mm10 - -transcriptomes: - human: hg19_cdna - mouse: mm10_cdna - +implied_columns: + organism: + human: + genomes: hg19 + transcriptome: hg19_cdna + mouse: + genome: mm10 + transcriptome: mm10_cdna pipeline_config: # pipeline configuration files used in project. 
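
For context on the example project config and annotation sheet added in the two
patches above (a minimal, hypothetical sketch rather than looper's actual code
path): because data_source is listed under derived_columns, a sample's
data_source value is used as a key into the data_sources section, and the
matching template is filled in from the sample's own attributes.

    # Illustrative resolution of a derived data_source entry, using the
    # microtest template and the "rrbs" row of microtest_sample_annotation.csv.
    data_sources = {
        "microtest":
            "/data/groups/lab_bock/shared/resources/microtest/{sample_name}.bam",
    }
    sample = {"sample_name": "rrbs", "library": "RRBS",
              "organism": "human", "data_source": "microtest"}

    template = data_sources[sample["data_source"]]
    print(template.format(**sample))
    # /data/groups/lab_bock/shared/resources/microtest/rrbs.bam

The merge table entries resolve the same way via the microtest_merge template,
with file_number filling the extra field.
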
From 780063c526bb666fb2065ad537beb1ef08c33663 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 23 Jun 2017 14:28:40 -0400 Subject: [PATCH 70/94] remove docs from example config and simplify --- examples/microtest_project_config.yaml | 55 +------------------------- 1 file changed, 2 insertions(+), 53 deletions(-) diff --git a/examples/microtest_project_config.yaml b/examples/microtest_project_config.yaml index c603fbe6..1cc64b65 100644 --- a/examples/microtest_project_config.yaml +++ b/examples/microtest_project_config.yaml @@ -1,43 +1,18 @@ -# This project config file describes all *project-specific variables* - metadata: - # output_dir: ABSOLUTE PATH to the parent, shared space where project results go output_dir: /scratch/lab_bock/shared/projects/microtest - # results and submission subdirs are subdirectors directories under parent output_dir - # results: where output sample folders will go - # submission: where cluster submit scripts and log files will go results_subdir: results_pipeline submission_subdir: submission - # pipelines_dir: ABSOLUTE PATH the directory where the Looper will find pipeline - # scripts (and accompanying pipeline config files) for submission. pipelines_dir: $CODEBASE/pipelines - # Elements in this section can be absolute or relative. - # Typically, this project config file is stored with the project metadata, so - # relative paths are considered relative to this project config file. - # sample_annotation: one-row-per-sample metadata sample_annotation: microtest_sample_annotation.csv - # merge_table: input for samples with more than one input file merge_table: microtest_merge_table.csv - # compare_table: comparison pairs or groups, like normalization samples - compare_table: null.csv - -# a list of annotation sheet columns that are "derived" -# the values in these are constructed using a regex-like expression -# of variables (defined in the next section). derived_columns: [data_source] - data_sources: - # specify the ABSOLUTE PATH of input files using variable path expressions - # entries correspond to values in the data_source column in sample_annotation table - # {variable} can be used to replace environment variables or other sample_annotation columns - # If you use {variable} codes, you should quote the field so python can parse it. bsf_samples: "{RAWDATA}{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam" microtest: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}.bam" microtest_merge: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}{file_number}.bam" - subprojects: config_test: pipeline_config: @@ -53,34 +28,8 @@ implied_columns: transcriptome: mm10_cdna pipeline_config: - # pipeline configuration files used in project. - # Key string must match the _name of the pipeline script_ (including extension) - # Relative paths are relative to this project config file. - # Default (null) means use the generic config for the pipeline. - # wgbs.py: null - # Or you can point to a specific config to be used in this project: - # rrbs.py: rrbs_config.yaml - # wgbs.py: wgbs_config.yaml - # cgps: cpgs_config.yaml - + rrbs.py: rrbs_config.yaml pipeline_args: rnaBitSeq.py: - "-w": 50 - - -trackhubs: - trackhub_dir: /data/groups/lab_bock/public_html/arendeiro/microtest/ - # url: if you include this, the make_trackhubs will produce a link to your track hub in the project folder. 
- url: http://www.whatever.com/ - matrix_x: cell_type - matrix_y: cell_count - sort_order: cell_type=+ - parent_track_name: ews_rrbs - visibility: dense - hub_name: ews_hub - short_label_column: sample_name - email: arendeiro@cemm.oeaw.ac.at - -username: user -email: user@email.com + "-w": 50 \ No newline at end of file From ce9ade1d03b28072313ef544bb4709df4f6264f1 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 23 Jun 2017 15:32:53 -0400 Subject: [PATCH 71/94] enforce __repr__ implementation and validate call safety --- looper/models.py | 7 +++++ tests/models/test_models_smoke.py | 45 +++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/looper/models.py b/looper/models.py index d454adcd..4ad0b25c 100644 --- a/looper/models.py +++ b/looper/models.py @@ -70,6 +70,13 @@ get_file_size, import_from_source, parse_ftype, partition +# TODO: decide if we want to denote functions for export. +__functions__ = [] +__classes__ = ["AttributeDict", "PipelineInterface", "Project", + "ProtocolInterface", "ProtocolMapper", "Sample"] +__all__ = __functions__ + __classes__ + + COMPUTE_SETTINGS_VARNAME = "PEPENV" DEFAULT_COMPUTE_RESOURCES_NAME = "default" DATA_SOURCE_COLNAME = "data_source" diff --git a/tests/models/test_models_smoke.py b/tests/models/test_models_smoke.py index 97dd411e..9d3aefdb 100644 --- a/tests/models/test_models_smoke.py +++ b/tests/models/test_models_smoke.py @@ -1,19 +1,64 @@ """ Basic smoketests for models """ +import inspect +import logging import pytest +import looper from looper.models import AttributeDict + __author__ = "Vince Reuter" __email__ = "vreuter@virgnia.edu" +_LOGGER = logging.getLogger(__name__) + + def pytest_generate_tests(metafunc): """ Dynamic test case parameterization. """ if metafunc.cls == AttributeDictRepresentationSmokeTests: metafunc.parametrize(argnames="representation_method", argvalues=["__repr__", "__str__"]) + elif metafunc.cls == ObjectRepresentationSmokeTests: + metafunc.parametrize(argnames="class_name", + argvalues=looper.models.__classes__) + metafunc.parametrize(argnames="method_name", argvalues=["__repr__"]) + + + +class ObjectRepresentationSmokeTests: + """ Tests for the text representation of important ADTs. """ + + + def test_implements_repr_smoke(self, class_name, method_name): + """ Each important ADT must implement a representation method. """ + + # Attempt a control assertion, that a subclass that doesn't override + # the given method of its superclass, uses the superclass version of + # the function in question. + class ObjectSubclass(object): + def __init__(self): + super(ObjectSubclass, self).__init__() + try: + subclass_version = getattr(ObjectSubclass, "__repr__") + superclass_version = getattr(object, method_name) + except AttributeError: + _LOGGER.debug("No object subclass vs. object validation for " + "method: '%s'", method_name) + else: + assert subclass_version is superclass_version + + # Make the actual assertion of interest. + adt = getattr(looper.models, class_name) + assert getattr(adt, method_name) != \ + getattr(adt.__bases__[0], method_name) + + def test_repr_smoke(self, class_name, method_name): + """ Object representation method successfully returns string. 
""" + # TODO: with pytest.raises(None) in 3.1+ + assert str is type(getattr(class_name, method_name).__call__()) @pytest.mark.usefixtures("write_project_files") From 6fbb7f5acc5cc92ea276c44c33b430ee06d4b774 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 23 Jun 2017 20:27:32 -0400 Subject: [PATCH 72/94] better handling of path expansion in PipelineInterface; tests for contract between models classes to provide representation of self; tests for path expansion --- looper/models.py | 18 +- .../independent/test_PipelineInterface.py | 213 ++++++++++++++++-- .../independent/test_ProtocolInterface.py | 17 ++ tests/models/test_models_smoke.py | 1 + 4 files changed, 229 insertions(+), 20 deletions(-) create mode 100644 tests/models/independent/test_ProtocolInterface.py diff --git a/looper/models.py b/looper/models.py index 4ad0b25c..65201a36 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2091,6 +2091,9 @@ def __init__(self, config): with open(config, 'r') as f: self.pipe_iface_config = yaml.load(f) + # Ensure that each pipeline path, if provided, is expanded. + self._expand_paths() + def __getitem__(self, item): try: @@ -2105,13 +2108,23 @@ def __iter__(self): def __repr__(self): - source = self.pipe_iface_file or "mapping" + source = self.pipe_iface_file or "Mapping" num_pipelines = len(self.pipe_iface_config) pipelines = ", ".join(self.pipe_iface_config.keys()) return "{} from {}, with {} pipeline(s): {}".format( self.__class__.__name__, source, num_pipelines, pipelines) + def _expand_paths(self): + for pipe_data in self.pipe_iface_config.values(): + if "path" in pipe_data: + pipe_path = pipe_data["path"] + _LOGGER.log(5, "Expanding path: '%s'", pipe_path) + pipe_path = expandpath(pipe_path) + _LOGGER.log(5, "Expanded: '%s'", pipe_path) + pipe_data["path"] = pipe_path + + @property def pipeline_names(self): """ @@ -2545,9 +2558,6 @@ def pipeline_key_to_path(self, pipeline_key): if self.pipe_iface.get_attribute(strict_pipeline_key, "path"): script_path_only = self.pipe_iface.get_attribute( strict_pipeline_key, "path")[0].strip() - _LOGGER.log(5, "Expanding path: '%s'", script_path_only) - script_path_only = expandpath(script_path_only) - _LOGGER.log(5, "Expanded: '%s'", script_path_only) script_path_with_flags = \ " ".join([script_path_only, pipeline_key_args]) else: diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 09bbc6d6..3368f9dc 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -3,8 +3,16 @@ import copy import inspect import itertools +import logging +import os import random +import sys +if sys.version_info < (3, 3): + from collections import Iterable, Mapping +else: + from collections.abc import Iterable, Mapping +import mock import pytest import yaml @@ -17,6 +25,9 @@ __email__ = "vreuter@virginia.edu" +_LOGGER = logging.getLogger(__name__) + + # Values with which to build pipeline interface keys and names PIPELINE_NAMES = ["ATACseq", "WGBS"] EXTENSIONS = [".py", ".sh", ".R"] @@ -37,10 +48,18 @@ def pytest_generate_tests(metafunc): try: parameters = metafunc.cls.PARAMETERS except AttributeError: + _LOGGER.debug("No indirect parameterization for test class: '{}'". + format(metafunc.cls)) pass else: for name, values in parameters.items(): metafunc.parametrize(argnames=name, argvalues=values) + if metafunc.cls == ConstructorPathParsingTests: + # Provide test case with two PipelineInterface config bundles. 
+ metafunc.parametrize( + argnames="piface_config_bundles", + argvalues=[(atacseq_iface_without_resources(), + {"name": "sans-path"})]) @@ -62,21 +81,6 @@ def resources(): -@pytest.mark.parametrize(argnames="from_file", argvalues=[False, True]) -def test_constructor_input_types(tmpdir, from_file, basic_pipe_iface_data): - """ PipelineInterface constructor handles Mapping or filepath. """ - if from_file: - pipe_iface_config = tmpdir.join("pipe-iface-conf.yaml").strpath - with open(tmpdir.join("pipe-iface-conf.yaml").strpath, 'w') as f: - yaml.safe_dump(basic_pipe_iface_data, f) - else: - pipe_iface_config = basic_pipe_iface_data - pi = PipelineInterface(pipe_iface_config) - assert basic_pipe_iface_data == pi.pipe_iface_config - assert pi.pipe_iface_file == (pipe_iface_config if from_file else None) - - - @pytest.fixture(scope="function") def pi_with_resources(request, basic_pipe_iface_data, resources): """ Add resource bundle data to each config section. """ @@ -96,6 +100,21 @@ def pi_with_resources(request, basic_pipe_iface_data, resources): +@pytest.mark.parametrize(argnames="from_file", argvalues=[False, True]) +def test_constructor_input_types(tmpdir, from_file, basic_pipe_iface_data): + """ PipelineInterface constructor handles Mapping or filepath. """ + if from_file: + pipe_iface_config = tmpdir.join("pipe-iface-conf.yaml").strpath + with open(tmpdir.join("pipe-iface-conf.yaml").strpath, 'w') as f: + yaml.safe_dump(basic_pipe_iface_data, f) + else: + pipe_iface_config = basic_pipe_iface_data + pi = PipelineInterface(pipe_iface_config) + assert basic_pipe_iface_data == pi.pipe_iface_config + assert pi.pipe_iface_file == (pipe_iface_config if from_file else None) + + + @pytest.mark.parametrize( argnames="funcname_and_kwargs", argvalues=[("choose_resource_package", {"file_size": 4}), @@ -158,7 +177,8 @@ def test_get_pipeline_name_inferred(self): pipelines = [name + ext for name, ext in zip(pipeline_names, extensions)] pi_config_data = {pipeline: None for pipeline in pipelines} - pi = PipelineInterface(pi_config_data) + with mock.patch("looper.models.PipelineInterface._expand_paths"): + pi = PipelineInterface(pi_config_data) for expected_name, pipeline in zip(pipeline_names, pipelines): assert expected_name == pi.get_pipeline_name(pipeline) @@ -341,6 +361,167 @@ def test_file_size_spec_required_for_non_default_packages( +@pytest.fixture(scope="function") +def atacseq_iface_without_resources(): + """ + Provide the ATAC-Seq pipeline interface as a fixture, without resources. + + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :return Mapping: all of the pipeline interface configuration data for + ATAC-Seq, minus the resources section + """ + return { + "name": "ATACseq", + "looper_args": True, + "required_input_files": ["read1", "read2"], + "all_input_files": ["read1", "read2"], + "ngs_input_files": ["read1", "read2"], + "arguments": { + "--sample-name": "sample_name", + "--genome": "genome", + "--input": "read1", + "--input2": "read2", + "--single-or-paired": "read_type" + }, + "optional_arguments": { + "--frip-ref-peaks": "FRIP_ref", + "--prealignments": "prealignments", + "--genome-size": "macs_genome_size" + } + } + + + +@pytest.fixture(scope="function") +def piface_config_bundles(request, resources): + """ + Provide the ATAC-Seq pipeline interface as a fixture, including resources. 
+ + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :param pytest._pytest.fixtures.SubRequest request: hook into test case + requesting this fixture, which is queried for a resources value with + which to override the default if it's present. + :param Mapping resources: pipeline interface resource specification + :return Iterable[Mapping]: collection of bundles of pipeline interface + configuration bundles + """ + iface_config_datas = request.getfixturevalue("config_bundles") + if isinstance(iface_config_datas, Mapping): + data_bundles = iface_config_datas.values() + elif isinstance(iface_config_datas, Iterable): + data_bundles = iface_config_datas + else: + raise TypeError("Expected mapping or list collection of " + "PipelineInterface data: {} ({})".format( + iface_config_datas, type(iface_config_datas))) + resources = request.getfixturevalue("resources") \ + if "resources" in request.fixturenames else resources + for config_bundle in data_bundles: + config_bundle.update(resources) + return iface_config_datas + + + +class ConstructorPathParsingTests: + """ The constructor is responsible for expanding pipeline path(s). """ + + ADD_PATH = [True, False] + PIPELINE_KEYS = ["ATACSeq.py", "no_path.py"] + RELATIVE_PATH_DATA = [ + ("./arbitrary-test-pipelines", + {}, + "./arbitrary-test-pipelines"), + ("path/to/$TEMP_PIPE_LOCS", + {"TEMP_PIPE_LOCS": "validation-value"}, + "path/to/validation-value")] + ABSOLUTE_PATHS = [os.path.join("~", "code_home", "bioinformatics"), + os.path.join("$TEMP_TEST_HOME", "subfolder"), + os.path.join("~", "$TEMPORARY_SUBFOLDER", "leaf")] + + + @pytest.fixture(scope="function") + def pipe_iface_data(self, piface_config_bundles): + return dict(zip(self.PIPELINE_KEYS, piface_config_bundles)) + + + @pytest.fixture(scope="function", autouse=True) + def apply_envvars(self, request): + """ Use environment variables temporarily. """ + + if "envvars" not in request.fixturenames: + # We're autousing, so check for the relevant fixture. + return + + original_envvars = {} + new_envvars = request.getfixturevalue("envvars") + + # Remember values that are replaced as variables are updated. + for name, value in new_envvars.items(): + try: + original_envvars[name] = os.environ[name] + except KeyError: + pass + os.environ[name] = value + + def restore(): + # Restore swapped variables and delete added ones. + for k, v in new_envvars.items(): + try: + os.environ[k] = original_envvars[k] + except KeyError: + del os.environ[k] + request.addfinalizer(restore) + + + def test_no_path(self, piface_config_bundles, pipe_iface_data): + """ PipelineInterface config sections need not specify path. """ + pi = PipelineInterface(pipe_iface_data) + for pipe_key in self.PIPELINE_KEYS: + piface_config = pi[pipe_key] + # Specific negative test of interest. + assert "path" not in piface_config + # Positive control validation. + assert pipe_iface_data[pipe_key] == piface_config + + + @pytest.mark.parametrize( + argnames=["pipe_path", "envvars", "expected"], + argvalues=RELATIVE_PATH_DATA) + def test_relative_path(self, piface_config_bundles, pipe_iface_data, + pipe_path, envvars, expected, apply_envvars): + """ + PipelineInterface construction expands pipeline path. 
+ + Environment variable(s) expand(s), but the path remains relative + if specified as such, deferring the joining with pipelines location + until used. + + """ + for add_path, pipe_key in zip(self.ADD_PATH, self.PIPELINE_KEYS): + if add_path: + pipe_iface_data[pipe_key]["path"] = pipe_path + pi = PipelineInterface(pipe_iface_data) + for add_path, pipe_key in zip(self.ADD_PATH, self.PIPELINE_KEYS): + if add_path: + assert expected == pi[pipe_key]["path"] + else: + assert "path" not in pi[pipe_key] + + + @pytest.mark.skip("Not implemented") + def test_path_expansion(self, piface_config_bundles, pipe_iface_data): + pass + + + @pytest.mark.skip("Not implemented") class PipelineInterfaceArgstringTests: """ """ diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py new file mode 100644 index 00000000..cf505f81 --- /dev/null +++ b/tests/models/independent/test_ProtocolInterface.py @@ -0,0 +1,17 @@ +""" Tests for ProtocolInterface, for Project/PipelineInterface interaction. """ + +import pytest +from looper.models import ProtocolInterface + + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + + +class PipelinePathResolutionTests: + """ Project requests pipeline information via an interface key. """ + + + def test_pipeline_interface_path(self): + pass diff --git a/tests/models/test_models_smoke.py b/tests/models/test_models_smoke.py index 9d3aefdb..2f55b1e1 100644 --- a/tests/models/test_models_smoke.py +++ b/tests/models/test_models_smoke.py @@ -61,6 +61,7 @@ def test_repr_smoke(self, class_name, method_name): assert str is type(getattr(class_name, method_name).__call__()) + @pytest.mark.usefixtures("write_project_files") class AttributeDictRepresentationSmokeTests: """ Non-fail validation of AttributeDict representations. """ From c1ceca99dcfdd84b2537708e2badf4f9970a6536 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Fri, 23 Jun 2017 23:11:33 -0400 Subject: [PATCH 73/94] reorganize to facilitate fixture reuse --- tests/models/independent/conftest.py | 99 +++++++++++++++++++ .../independent/test_PipelineInterface.py | 75 -------------- .../independent/test_ProtocolInterface.py | 22 ++++- 3 files changed, 120 insertions(+), 76 deletions(-) create mode 100644 tests/models/independent/conftest.py diff --git a/tests/models/independent/conftest.py b/tests/models/independent/conftest.py new file mode 100644 index 00000000..5a671161 --- /dev/null +++ b/tests/models/independent/conftest.py @@ -0,0 +1,99 @@ +""" Configuration for modules with independent tests of models. """ + +import sys +if sys.version_info < (3, 3): + from collections import Iterable, Mapping +else: + from collections.abc import Iterable, Mapping +import pytest + + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + + +def pytest_generate_tests(metafunc): + """ Conditional customization of test cases in this directory. """ + try: + classname = metafunc.cls.__name__ + except AttributeError: + # Some functions don't belong to a class. + pass + else: + if classname in ["ConstructorPathParsingTests", + "PipelinePathResolutionTests"]: + # Provide test case with two PipelineInterface config bundles. + metafunc.parametrize( + argnames="piface_config_bundles", + argvalues=[(atacseq_iface_without_resources(), + {"name": "sans-path"})]) + + + +@pytest.fixture(scope="function") +def atacseq_iface_without_resources(): + """ + Provide the ATAC-Seq pipeline interface as a fixture, without resources. 
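
The motivation for the new conftest.py is fixture reuse: pytest exposes fixtures defined in a directory's conftest.py to every test module under that directory, with no imports required. A tiny sketch of that layout (file contents are illustrative, not taken from the patch):

    # tests/models/independent/conftest.py
    import pytest

    @pytest.fixture(scope="function")
    def atac_pipe_name():
        return "ATACSeq.py"

    # tests/models/independent/test_ProtocolInterface.py
    def test_fixture_is_shared(atac_pipe_name):
        # pytest injects the conftest fixture by argument name.
        assert atac_pipe_name.endswith(".py")
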
+ + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :return Mapping: all of the pipeline interface configuration data for + ATAC-Seq, minus the resources section + """ + return { + "name": "ATACseq", + "looper_args": True, + "required_input_files": ["read1", "read2"], + "all_input_files": ["read1", "read2"], + "ngs_input_files": ["read1", "read2"], + "arguments": { + "--sample-name": "sample_name", + "--genome": "genome", + "--input": "read1", + "--input2": "read2", + "--single-or-paired": "read_type" + }, + "optional_arguments": { + "--frip-ref-peaks": "FRIP_ref", + "--prealignments": "prealignments", + "--genome-size": "macs_genome_size" + } + } + + + +@pytest.fixture(scope="function") +def piface_config_bundles(request, resources): + """ + Provide the ATAC-Seq pipeline interface as a fixture, including resources. + + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :param pytest._pytest.fixtures.SubRequest request: hook into test case + requesting this fixture, which is queried for a resources value with + which to override the default if it's present. + :param Mapping resources: pipeline interface resource specification + :return Iterable[Mapping]: collection of bundles of pipeline interface + configuration bundles + """ + iface_config_datas = request.getfixturevalue("config_bundles") + if isinstance(iface_config_datas, Mapping): + data_bundles = iface_config_datas.values() + elif isinstance(iface_config_datas, Iterable): + data_bundles = iface_config_datas + else: + raise TypeError("Expected mapping or list collection of " + "PipelineInterface data: {} ({})".format( + iface_config_datas, type(iface_config_datas))) + resources = request.getfixturevalue("resources") \ + if "resources" in request.fixturenames else resources + for config_bundle in data_bundles: + config_bundle.update(resources) + return iface_config_datas diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 3368f9dc..93b72162 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -54,12 +54,6 @@ def pytest_generate_tests(metafunc): else: for name, values in parameters.items(): metafunc.parametrize(argnames=name, argvalues=values) - if metafunc.cls == ConstructorPathParsingTests: - # Provide test case with two PipelineInterface config bundles. - metafunc.parametrize( - argnames="piface_config_bundles", - argvalues=[(atacseq_iface_without_resources(), - {"name": "sans-path"})]) @@ -361,75 +355,6 @@ def test_file_size_spec_required_for_non_default_packages( -@pytest.fixture(scope="function") -def atacseq_iface_without_resources(): - """ - Provide the ATAC-Seq pipeline interface as a fixture, without resources. - - Note that this represents the configuration data for the interface for a - single pipeline. In order to use this in the form that a PipelineInterface - expects, this needs to be the value to which a key is mapped within a - larger Mapping. 
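
As a standalone illustration (not code from the patch) of the Mapping-versus-Iterable dispatch that piface_config_bundles performs on the incoming configuration data:

    try:
        from collections.abc import Iterable, Mapping   # Python 3.3+
    except ImportError:
        from collections import Iterable, Mapping       # Python 2

    def normalize_bundles(data):
        # Accept either a name-keyed Mapping or a plain collection of bundles.
        if isinstance(data, Mapping):
            return list(data.values())
        if isinstance(data, Iterable):
            return list(data)
        raise TypeError(
            "Expected Mapping or Iterable, got {}".format(type(data)))

    assert [{"name": "ATACseq"}] == \
        normalize_bundles({"ATACSeq.py": {"name": "ATACseq"}})
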
- - :return Mapping: all of the pipeline interface configuration data for - ATAC-Seq, minus the resources section - """ - return { - "name": "ATACseq", - "looper_args": True, - "required_input_files": ["read1", "read2"], - "all_input_files": ["read1", "read2"], - "ngs_input_files": ["read1", "read2"], - "arguments": { - "--sample-name": "sample_name", - "--genome": "genome", - "--input": "read1", - "--input2": "read2", - "--single-or-paired": "read_type" - }, - "optional_arguments": { - "--frip-ref-peaks": "FRIP_ref", - "--prealignments": "prealignments", - "--genome-size": "macs_genome_size" - } - } - - - -@pytest.fixture(scope="function") -def piface_config_bundles(request, resources): - """ - Provide the ATAC-Seq pipeline interface as a fixture, including resources. - - Note that this represents the configuration data for the interface for a - single pipeline. In order to use this in the form that a PipelineInterface - expects, this needs to be the value to which a key is mapped within a - larger Mapping. - - :param pytest._pytest.fixtures.SubRequest request: hook into test case - requesting this fixture, which is queried for a resources value with - which to override the default if it's present. - :param Mapping resources: pipeline interface resource specification - :return Iterable[Mapping]: collection of bundles of pipeline interface - configuration bundles - """ - iface_config_datas = request.getfixturevalue("config_bundles") - if isinstance(iface_config_datas, Mapping): - data_bundles = iface_config_datas.values() - elif isinstance(iface_config_datas, Iterable): - data_bundles = iface_config_datas - else: - raise TypeError("Expected mapping or list collection of " - "PipelineInterface data: {} ({})".format( - iface_config_datas, type(iface_config_datas))) - resources = request.getfixturevalue("resources") \ - if "resources" in request.fixturenames else resources - for config_bundle in data_bundles: - config_bundle.update(resources) - return iface_config_datas - - - class ConstructorPathParsingTests: """ The constructor is responsible for expanding pipeline path(s). """ diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index cf505f81..0b6ee67f 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -9,9 +9,29 @@ +@pytest.mark.skip("Not implemented") class PipelinePathResolutionTests: """ Project requests pipeline information via an interface key. """ - def test_pipeline_interface_path(self): + def test_no_path(self, piface_config_bundles): pass + + + def test_relative_path(self, piface_config_bundles): + pass + + + def test_absolute_path(self, piface_config_bundles): + pass + + + def test_pipeline_interface_path(self, piface_config_bundles): + pass + + + +@pytest.mark.skip("Not implemented") +class ProtocolInterfacePipelineSampleSubtypeTests: + """ ProtocolInterface attempts import of pipeline-specific Sample. 
""" + pass From 86deccd93205dc45f143b93a03abd75601c05c24 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 24 Jun 2017 00:06:35 -0400 Subject: [PATCH 74/94] first ProtocolInterface test implemented --- tests/models/independent/conftest.py | 60 ++++++++++++++++++- .../independent/test_PipelineInterface.py | 53 ++++++---------- .../independent/test_ProtocolInterface.py | 45 ++++++++++++-- 3 files changed, 114 insertions(+), 44 deletions(-) diff --git a/tests/models/independent/conftest.py b/tests/models/independent/conftest.py index 5a671161..85dad34c 100644 --- a/tests/models/independent/conftest.py +++ b/tests/models/independent/conftest.py @@ -1,11 +1,13 @@ """ Configuration for modules with independent tests of models. """ +import copy import sys if sys.version_info < (3, 3): from collections import Iterable, Mapping else: from collections.abc import Iterable, Mapping import pytest +from looper.models import DEFAULT_COMPUTE_RESOURCES_NAME __author__ = "Vince Reuter" @@ -13,6 +15,16 @@ +# Compute resource bundles for pipeline interface configuration data +DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, + "time": "0-01:00:00", "partition": "local"} +MIDSIZE_RESOURCES = {"file_size": 10, "cores": 8, "mem": 16000, + "time": "0-07:00:00", "partition": "serial"} +HUGE_RESOURCES = {"file_size": 30, "cores": 24, "mem": 64000, + "time": "30-00:00:00", "partition": "longq"} + + + def pytest_generate_tests(metafunc): """ Conditional customization of test cases in this directory. """ try: @@ -21,11 +33,10 @@ def pytest_generate_tests(metafunc): # Some functions don't belong to a class. pass else: - if classname in ["ConstructorPathParsingTests", - "PipelinePathResolutionTests"]: + if classname == "ConstructorPathParsingTests": # Provide test case with two PipelineInterface config bundles. metafunc.parametrize( - argnames="piface_config_bundles", + argnames="config_bundles", argvalues=[(atacseq_iface_without_resources(), {"name": "sans-path"})]) @@ -66,6 +77,41 @@ def atacseq_iface_without_resources(): +@pytest.fixture(scope="function") +def atacseq_piface_data(atacseq_iface_without_resources, resources): + """ + Provide a test case with data for an ATACSeq PipelineInterface. + + :param dict atacseq_iface_without_resources: PipelineInterface config + data, minus a resources section + :param Mapping resources: resources section of PipelineInterface + configuration data + :return dict: configuration data needed to create PipelineInterface + """ + piface = copy.deepcopy(atacseq_iface_without_resources) + piface.update(resources) + return {"ATACSeq.py": piface} + + + +@pytest.fixture(scope="function") +def default_resources(): + return copy.deepcopy(DEFAULT_RESOURCES) + + + +@pytest.fixture(scope="function") +def huge_resources(): + return copy.deepcopy(HUGE_RESOURCES) + + + +@pytest.fixture(scope="function") +def midsize_resources(): + return copy.deepcopy(MIDSIZE_RESOURCES) + + + @pytest.fixture(scope="function") def piface_config_bundles(request, resources): """ @@ -97,3 +143,11 @@ def piface_config_bundles(request, resources): for config_bundle in data_bundles: config_bundle.update(resources) return iface_config_datas + + + +@pytest.fixture(scope="function") +def resources(): + """ Basic PipelineInterface compute resources data. 
""" + return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), + "huge": copy.copy(HUGE_RESOURCES)} diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 93b72162..781cb2c9 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -6,11 +6,6 @@ import logging import os import random -import sys -if sys.version_info < (3, 3): - from collections import Iterable, Mapping -else: - from collections.abc import Iterable, Mapping import mock import pytest @@ -32,15 +27,6 @@ PIPELINE_NAMES = ["ATACseq", "WGBS"] EXTENSIONS = [".py", ".sh", ".R"] -# Compute resource bundles for pipeline interface configuration data -DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, - "time": "0-01:00:00", "partition": "local"} -MIDSIZE_RESOURCES = {"file_size": 10, "cores": 8, "mem": 16000, - "time": "0-07:00:00", "partition": "serial"} -HUGE_RESOURCES = {"file_size": 30, "cores": 24, "mem": 64000, - "time": "30-00:00:00", "partition": "longq"} -HUGE_RESOURCES_NAME = "huge" - def pytest_generate_tests(metafunc): @@ -67,14 +53,6 @@ def basic_pipe_iface_data(request): -@pytest.fixture(scope="function") -def resources(): - """ Basic PipelineInterface compute resources data. """ - return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), - "huge": copy.copy(HUGE_RESOURCES)} - - - @pytest.fixture(scope="function") def pi_with_resources(request, basic_pipe_iface_data, resources): """ Add resource bundle data to each config section. """ @@ -185,7 +163,7 @@ class PipelineInterfaceResourcePackageTests: def test_requires_default( - self, use_new_file_size, pi_with_resources): + self, use_new_file_size, pi_with_resources, huge_resources): """ If provided, resources specification needs 'default.' """ pi = pi_with_resources for name, pipeline in pi: @@ -197,7 +175,7 @@ def test_requires_default( assert "default" not in pipeline["resources"] with pytest.raises(_InvalidResourceSpecificationException): pi.choose_resource_package( - name, file_size=HUGE_RESOURCES["file_size"] + 1) + name, file_size=huge_resources["file_size"] + 1) def test_negative_file_size_request( @@ -229,11 +207,11 @@ def test_resources_not_required( (16, "midsize"), (64, "huge")]) def test_selects_proper_resource_package( self, use_new_file_size, pi_with_resources, - file_size, expected_package_name): + file_size, expected_package_name, midsize_resources): """ Minimal resource package sufficient for pipeline and file size. """ for pipe_data in pi_with_resources.pipelines: pipe_data["resources"].update( - {"midsize": copy.deepcopy(MIDSIZE_RESOURCES)}) + {"midsize": copy.deepcopy(midsize_resources)}) for pipe_name, pipe_data in pi_with_resources: observed_package = pi_with_resources.choose_resource_package( pipe_name, file_size) @@ -257,7 +235,8 @@ def test_negative_file_size_prohibited( def test_file_size_spec_not_required_for_default( - self, use_new_file_size, basic_pipe_iface_data): + self, use_new_file_size, basic_pipe_iface_data, + default_resources, huge_resources, midsize_resources): """ Default package implies minimum file size of zero. 
""" def clear_file_size(resource_package): @@ -269,7 +248,7 @@ def clear_file_size(resource_package): resources_data = dict(zip( ["default", "midsize", "huge"], [copy.deepcopy(data) for data in - [DEFAULT_RESOURCES, MIDSIZE_RESOURCES, HUGE_RESOURCES]])) + [default_resources, midsize_resources, huge_resources]])) for pack_name, pack_data in resources_data.items(): # Use file size spec name as appropriate; clean default package. if pack_name == "default": @@ -326,13 +305,14 @@ def test_default_package_new_name_zero_size( def test_file_size_spec_required_for_non_default_packages( - self, use_new_file_size, basic_pipe_iface_data): + self, use_new_file_size, basic_pipe_iface_data, + default_resources, huge_resources): """ Resource packages besides default require file size. """ # Establish the resource specification. resource_package_data = { - "default": copy.deepcopy(DEFAULT_RESOURCES), - "huge": copy.deepcopy(HUGE_RESOURCES)} + "default": copy.deepcopy(default_resources), + "huge": copy.deepcopy(huge_resources)} # Remove file size for non-default; set it for default. del resource_package_data["huge"]["file_size"] @@ -406,7 +386,8 @@ def restore(): request.addfinalizer(restore) - def test_no_path(self, piface_config_bundles, pipe_iface_data): + def test_no_path(self, config_bundles, piface_config_bundles, + pipe_iface_data): """ PipelineInterface config sections need not specify path. """ pi = PipelineInterface(pipe_iface_data) for pipe_key in self.PIPELINE_KEYS: @@ -420,8 +401,9 @@ def test_no_path(self, piface_config_bundles, pipe_iface_data): @pytest.mark.parametrize( argnames=["pipe_path", "envvars", "expected"], argvalues=RELATIVE_PATH_DATA) - def test_relative_path(self, piface_config_bundles, pipe_iface_data, - pipe_path, envvars, expected, apply_envvars): + def test_relative_path( + self, config_bundles, piface_config_bundles, pipe_iface_data, + pipe_path, envvars, expected, apply_envvars): """ PipelineInterface construction expands pipeline path. @@ -442,7 +424,8 @@ def test_relative_path(self, piface_config_bundles, pipe_iface_data, @pytest.mark.skip("Not implemented") - def test_path_expansion(self, piface_config_bundles, pipe_iface_data): + def test_path_expansion(self, config_bundles, piface_config_bundles, + pipe_iface_data): pass diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 0b6ee67f..c05b4323 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -1,6 +1,8 @@ """ Tests for ProtocolInterface, for Project/PipelineInterface interaction. """ +import os import pytest +import yaml from looper.models import ProtocolInterface @@ -9,24 +11,55 @@ -@pytest.mark.skip("Not implemented") +def _write_config_data(conf_data, dirpath): + """ + + :param conf_data: + :param dirpath: + :return: + """ + filepath = os.path.join(dirpath, "pipeline_interface.yaml") + with open(filepath, 'w') as conf_file: + yaml.safe_dump(conf_data, conf_file) + return filepath + + + +@pytest.fixture(scope="function") +def path_config_file(request, tmpdir): + conf_data = request.getfixturevalue("atacseq_piface_data") + full_conf_data = {"protocol_mapping": {"ATAC": "ATACSeq.py"}, + "pipelines": conf_data} + return _write_config_data(full_conf_data, dirpath=tmpdir.strpath) + + + class PipelinePathResolutionTests: """ Project requests pipeline information via an interface key. 
""" + PIPELINE_KEY = "ATACSeq.py" - def test_no_path(self, piface_config_bundles): - pass + def test_no_path(self, atacseq_piface_data, path_config_file): + proto_iface = ProtocolInterface(path_config_file) + config_dirpath = os.path.dirname(path_config_file) + expected_pipe_path = os.path.join(config_dirpath, self.PIPELINE_KEY) + _, full_pipe_path, _ = \ + proto_iface.pipeline_key_to_path(self.PIPELINE_KEY) + assert expected_pipe_path == full_pipe_path - def test_relative_path(self, piface_config_bundles): + @pytest.mark.skip("Not implemented") + def test_relative_path(self, piface_config): pass - def test_absolute_path(self, piface_config_bundles): + @pytest.mark.skip("Not implemented") + def test_absolute_path(self, piface_config): pass - def test_pipeline_interface_path(self, piface_config_bundles): + @pytest.mark.skip("Not implemented") + def test_pipeline_interface_path(self, piface_config): pass From eb7ac19261b0b1e82538f752898de9d7b3f3b981 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 24 Jun 2017 17:15:31 -0400 Subject: [PATCH 75/94] better function name; more tests and expose lack of leading-dot relative path handling --- looper/models.py | 14 ++- tests/models/independent/conftest.py | 13 ++- .../independent/test_PipelineInterface.py | 4 +- .../independent/test_ProtocolInterface.py | 91 ++++++++++++++++--- 4 files changed, 99 insertions(+), 23 deletions(-) diff --git a/looper/models.py b/looper/models.py index 65201a36..336498e2 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2464,7 +2464,7 @@ def create_submission_bundle(self, pipeline_key, protocol): subtype = None strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ - self.pipeline_key_to_path(pipeline_key) + self.finalize_pipeline_key_and_paths(pipeline_key) this_pipeline_data = self.pipe_iface[strict_pipe_key] try: @@ -2539,10 +2539,15 @@ def _parse_iface_data(cls, pipe_iface_data): return attribute_values - def pipeline_key_to_path(self, pipeline_key): + def finalize_pipeline_key_and_paths(self, pipeline_key): """ - Given a pipeline_key, return the path to the script for that pipeline - specified in this pipeline interface config file. + Determine pipeline's full path, arguments, and strict key. + + This handles multiple ways in which to refer to a pipeline (by key) + within the mapping that contains the data that defines a + PipelineInterface. It also ensures proper handling of the path to the + pipeline (i.e., ensuring that it's absolute), and that the text for + the arguments are appropriately dealt parsed and passed. :param str pipeline_key: the key in the pipeline interface file used for the protocol_mappings section. Previously was the script name. 
@@ -2573,6 +2578,7 @@ def pipeline_key_to_path(self, pipeline_key): if not _os.path.exists(script_path_only): _LOGGER.warn( "Missing script command: '{}'".format(script_path_only)) + return strict_pipeline_key, script_path_only, script_path_with_flags diff --git a/tests/models/independent/conftest.py b/tests/models/independent/conftest.py index 85dad34c..bd39d38a 100644 --- a/tests/models/independent/conftest.py +++ b/tests/models/independent/conftest.py @@ -78,7 +78,14 @@ def atacseq_iface_without_resources(): @pytest.fixture(scope="function") -def atacseq_piface_data(atacseq_iface_without_resources, resources): +def atac_pipe_name(): + return "ATACSeq.py" + + + +@pytest.fixture(scope="function") +def atacseq_piface_data(atacseq_iface_without_resources, + resources, atac_pipe_name): """ Provide a test case with data for an ATACSeq PipelineInterface. @@ -86,11 +93,13 @@ def atacseq_piface_data(atacseq_iface_without_resources, resources): data, minus a resources section :param Mapping resources: resources section of PipelineInterface configuration data + :param str atac_pipe_name: name/key for the pipeline to which the + interface data pertains :return dict: configuration data needed to create PipelineInterface """ piface = copy.deepcopy(atacseq_iface_without_resources) piface.update(resources) - return {"ATACSeq.py": piface} + return {atac_pipe_name: piface} diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 781cb2c9..90cf830a 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -408,8 +408,8 @@ def test_relative_path( PipelineInterface construction expands pipeline path. Environment variable(s) expand(s), but the path remains relative - if specified as such, deferring the joining with pipelines location - until used. + if specified as such, deferring the joining with pipelines location, + which makes the path absolute, until the path is actually used. """ for add_path, pipe_key in zip(self.ADD_PATH, self.PIPELINE_KEYS): diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index c05b4323..3ef80946 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -11,46 +11,107 @@ -def _write_config_data(conf_data, dirpath): +def _write_config_data(protomap, conf_data, dirpath): """ - + + :param protomap :param conf_data: :param dirpath: :return: """ + full_conf_data = {"protocol_mapping": protomap, "pipelines": conf_data} filepath = os.path.join(dirpath, "pipeline_interface.yaml") with open(filepath, 'w') as conf_file: - yaml.safe_dump(conf_data, conf_file) + yaml.safe_dump(full_conf_data, conf_file) return filepath @pytest.fixture(scope="function") -def path_config_file(request, tmpdir): +def path_config_file(request, tmpdir, atac_pipe_name): + """ + Write PipelineInterface configuration data to disk. + + Grab the data from the test case's appropriate fixture. 
Also check the + test case parameterization for pipeline path specification, adding it to + the configuration data before writing to disk if the path specification is + present + + :param pytest._pytest.fixtures.SubRequest request: test case requesting + this fixture + :param py.path.local.LocalPath tmpdir: temporary directory fixture + :param str atac_pipe_name: name/key for ATAC-Seq pipeline; this should + also be used by the requesting test case if a path is to be added; + separating the name from the folder path allows parameterization of + the test case in terms of folder path, with pipeline name appended + after the fact (that is, the name fixture can't be used in the ) + :return str: path to the configuration file written + """ conf_data = request.getfixturevalue("atacseq_piface_data") - full_conf_data = {"protocol_mapping": {"ATAC": "ATACSeq.py"}, - "pipelines": conf_data} - return _write_config_data(full_conf_data, dirpath=tmpdir.strpath) + if "pipe_path" in request.fixturenames: + pipeline_dirpath = request.getfixturevalue("pipe_path") + pipe_path = os.path.join(pipeline_dirpath, atac_pipe_name) + # Pipeline key/name is mapped to the interface data; insert path in + # that Mapping, not at the top level, in which name/key is mapped to + # interface data bundle. + for iface_bundle in conf_data.values(): + iface_bundle["path"] = pipe_path + return _write_config_data(protomap={"ATAC": atac_pipe_name}, + conf_data=conf_data, dirpath=tmpdir.strpath) class PipelinePathResolutionTests: """ Project requests pipeline information via an interface key. """ - PIPELINE_KEY = "ATACSeq.py" - def test_no_path(self, atacseq_piface_data, path_config_file): - proto_iface = ProtocolInterface(path_config_file) + def test_no_path(self, atacseq_piface_data, + path_config_file, atac_pipe_name): + """ Without explicit path, pipeline is assumed parallel to config. """ + + piface = ProtocolInterface(path_config_file) + + # The pipeline is assumed to live alongside its configuration file. config_dirpath = os.path.dirname(path_config_file) - expected_pipe_path = os.path.join(config_dirpath, self.PIPELINE_KEY) + expected_pipe_path = os.path.join(config_dirpath, atac_pipe_name) + _, full_pipe_path, _ = \ - proto_iface.pipeline_key_to_path(self.PIPELINE_KEY) + piface.finalize_pipeline_key_and_paths(atac_pipe_name) assert expected_pipe_path == full_pipe_path - @pytest.mark.skip("Not implemented") - def test_relative_path(self, piface_config): - pass + def test_relpath_drops_dot_and_becomes_absolute( + self, tmpdir, atac_pipe_name, atacseq_piface_data): + """ Leading dot drops from relative path, and it's made absolute. """ + path_parts = ["relpath", "to", "pipelines"] + sans_dot_path = os.path.join(*path_parts) + pipe_path = os.path.join(".", sans_dot_path) + atacseq_piface_data[atac_pipe_name]["path"] = pipe_path + + exp_path = os.path.join(tmpdir.strpath, sans_dot_path, atac_pipe_name) + + path_config_file = _write_config_data( + protomap={"ATAC": atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(path_config_file) + _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) + assert exp_path == obs_path + + + @pytest.mark.parametrize( + argnames="pipe_path", argvalues=["relative/pipelines/path"]) + def test_non_dot_relpath_becomes_absolute( + self, atacseq_piface_data, path_config_file, + tmpdir, pipe_path, atac_pipe_name): + """ Relative pipeline path is made absolute when requested by key. 
""" + # TODO: constant-ify "path" and "ATACSeq.py", as well as possibly "pipelines" + # and "protocol_mapping" section names of PipelineInterface + exp_pipe_path = os.path.join( + tmpdir.strpath, pipe_path, atac_pipe_name) + piface = ProtocolInterface(path_config_file) + _, obs_pipe_path, _ = piface.finalize_pipeline_key_and_paths( + atac_pipe_name) + assert exp_pipe_path == obs_pipe_path @pytest.mark.skip("Not implemented") From 89c5895a6a2142ce1feedb2e265b36c755ea40e9 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 24 Jun 2017 23:36:18 -0400 Subject: [PATCH 76/94] little more info about the path expansion should it ever fail; fix up the test expectation to deal with the lingering relative path stuff --- looper/models.py | 7 ++++++- tests/models/independent/test_ProtocolInterface.py | 9 +++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/looper/models.py b/looper/models.py index 336498e2..c371a257 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2571,13 +2571,18 @@ def finalize_pipeline_key_and_paths(self, pipeline_key): script_path_with_flags = pipeline_key if not _os.path.isabs(script_path_only): + _LOGGER.log(5, "Expanding non-absolute script path: '%s'", + script_path_only) script_path_only = _os.path.join( self.pipelines_path, script_path_only) + _LOGGER.log(5, "Absolute script path: '%s'", script_path_only) script_path_with_flags = _os.path.join( self.pipelines_path, script_path_with_flags) + _LOGGER.log(5, "Absolute script path with flags: '%s'", + script_path_with_flags) if not _os.path.exists(script_path_only): _LOGGER.warn( - "Missing script command: '{}'".format(script_path_only)) + "Missing pipeline script: '%s'", script_path_only) return strict_pipeline_key, script_path_only, script_path_with_flags diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 3ef80946..0d9d72ce 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -80,22 +80,23 @@ def test_no_path(self, atacseq_piface_data, assert expected_pipe_path == full_pipe_path - def test_relpath_drops_dot_and_becomes_absolute( + def test_relpath_with_dot_becomes_absolute( self, tmpdir, atac_pipe_name, atacseq_piface_data): """ Leading dot drops from relative path, and it's made absolute. """ - path_parts = ["relpath", "to", "pipelines"] + path_parts = ["relpath", "to", "pipelines", atac_pipe_name] sans_dot_path = os.path.join(*path_parts) pipe_path = os.path.join(".", sans_dot_path) atacseq_piface_data[atac_pipe_name]["path"] = pipe_path - exp_path = os.path.join(tmpdir.strpath, sans_dot_path, atac_pipe_name) + exp_path = os.path.join(tmpdir.strpath, sans_dot_path) path_config_file = _write_config_data( protomap={"ATAC": atac_pipe_name}, conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) piface = ProtocolInterface(path_config_file) _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) - assert exp_path == obs_path + # Dot may remain in path, so assert equality of absolute paths. 
+ assert os.path.abspath(exp_path) == os.path.abspath(obs_path) @pytest.mark.parametrize( From db3cf18af10d12f5552eb7deac4a91f8dd5ccc51 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sat, 24 Jun 2017 23:49:18 -0400 Subject: [PATCH 77/94] finish protocol interface pipeline path tests, passing --- .../independent/test_ProtocolInterface.py | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 0d9d72ce..3e7e5172 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -107,22 +107,30 @@ def test_non_dot_relpath_becomes_absolute( """ Relative pipeline path is made absolute when requested by key. """ # TODO: constant-ify "path" and "ATACSeq.py", as well as possibly "pipelines" # and "protocol_mapping" section names of PipelineInterface - exp_pipe_path = os.path.join( + exp_path = os.path.join( tmpdir.strpath, pipe_path, atac_pipe_name) piface = ProtocolInterface(path_config_file) - _, obs_pipe_path, _ = piface.finalize_pipeline_key_and_paths( - atac_pipe_name) - assert exp_pipe_path == obs_pipe_path - - - @pytest.mark.skip("Not implemented") - def test_absolute_path(self, piface_config): - pass + _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) + assert exp_path == obs_path - @pytest.mark.skip("Not implemented") - def test_pipeline_interface_path(self, piface_config): - pass + @pytest.mark.parametrize( + argnames=["pipe_path", "expected_path_base"], + argvalues=[(os.path.join("$HOME", "code-base-home", "biopipes"), + os.path.join(os.path.expandvars("$HOME"), + "code-base-home", "biopipes")), + (os.path.join("~", "bioinformatics-pipelines"), + os.path.join(os.path.expanduser("~"), + "bioinformatics-pipelines"))]) + def test_absolute_path( + self, atacseq_piface_data, path_config_file, tmpdir, pipe_path, + expected_path_base, atac_pipe_name): + """ Absolute path regardless of variables works as pipeline path. 
""" + exp_path = os.path.join( + tmpdir.strpath, expected_path_base, atac_pipe_name) + piface = ProtocolInterface(path_config_file) + _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) + assert exp_path == obs_path From fae665624216bf3a9706d730bb0e9c059d096afa Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 25 Jun 2017 00:07:28 -0400 Subject: [PATCH 78/94] add path expansion tests for PipelineInterface, passing --- .../independent/test_PipelineInterface.py | 32 +++++++++++++++---- .../independent/test_ProtocolInterface.py | 6 ++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 90cf830a..c23350fa 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -347,9 +347,17 @@ class ConstructorPathParsingTests: ("path/to/$TEMP_PIPE_LOCS", {"TEMP_PIPE_LOCS": "validation-value"}, "path/to/validation-value")] - ABSOLUTE_PATHS = [os.path.join("~", "code_home", "bioinformatics"), - os.path.join("$TEMP_TEST_HOME", "subfolder"), - os.path.join("~", "$TEMPORARY_SUBFOLDER", "leaf")] + ABSOLUTE_PATHS = [ + os.path.join("~", "code_home", "bioinformatics"), + os.path.join("$TEMP_TEST_HOME", "subfolder"), + os.path.join("~", "$TEMPORARY_SUBFOLDER", "leaf")] + ABSPATH_ENVVARS = {"TEMP_TEST_HOME": "tmptest-home-folder", + "TEMPORARY_SUBFOLDER": "temp-subfolder"} + EXPECTED_PATHS_ABSOLUTE = [ + os.path.join(os.path.expanduser("~"), "code_home", + "bioinformatics"), + os.path.join("tmptest-home-folder", "subfolder"), + os.path.join(os.path.expanduser("~"), "temp-subfolder", "leaf")] @pytest.fixture(scope="function") @@ -423,10 +431,20 @@ def test_relative_path( assert "path" not in pi[pipe_key] - @pytest.mark.skip("Not implemented") - def test_path_expansion(self, config_bundles, piface_config_bundles, - pipe_iface_data): - pass + @pytest.mark.parametrize( + argnames=["pipe_path", "envvars", "expected"], + argvalues=zip(ABSOLUTE_PATHS, + len(ABSOLUTE_PATHS) * [ABSPATH_ENVVARS], + EXPECTED_PATHS_ABSOLUTE)) + def test_path_expansion( + self, pipe_path, envvars, expected, + config_bundles, piface_config_bundles, pipe_iface_data): + """ User/environment variables are expanded. """ + for piface_data in pipe_iface_data.values(): + piface_data["path"] = pipe_path + pi = PipelineInterface(pipe_iface_data) + for _, piface_data in pi: + assert expected == piface_data["path"] diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 3e7e5172..cc3621fc 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -133,6 +133,12 @@ def test_absolute_path( assert exp_path == obs_path + @pytest.mark.skip("Not implemented") + def test_warns_about_nonexistent_pipeline_script_path(self): + """ Nonexistent, resolved pipeline script path generates warning. 
""" + pass + + @pytest.mark.skip("Not implemented") class ProtocolInterfacePipelineSampleSubtypeTests: From 8d4d5e614a9f6bad4820e34c27a7df950c14b947 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 25 Jun 2017 00:29:46 -0400 Subject: [PATCH 79/94] add tests to check for warning message about nonexistent pipeline script path, passing --- .../independent/test_ProtocolInterface.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index cc3621fc..ace63514 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -1,8 +1,10 @@ """ Tests for ProtocolInterface, for Project/PipelineInterface interaction. """ +import logging import os import pytest import yaml +from looper import models, DEV_LOGGING_FMT from looper.models import ProtocolInterface @@ -133,10 +135,32 @@ def test_absolute_path( assert exp_path == obs_path - @pytest.mark.skip("Not implemented") - def test_warns_about_nonexistent_pipeline_script_path(self): + @pytest.mark.xfail( + condition=models._LOGGER.getEffectiveLevel() < logging.WARN, + reason="Insufficient logging level to capture warning message: {}". + format(models._LOGGER.getEffectiveLevel())) + @pytest.mark.parametrize( + argnames="pipe_path", + argvalues=["nonexistent.py", "path/to/missing.py", + "/abs/path/to/mythical"]) + def test_warns_about_nonexistent_pipeline_script_path( + self, atacseq_piface_data, path_config_file, + tmpdir, pipe_path, atac_pipe_name): """ Nonexistent, resolved pipeline script path generates warning. """ - pass + name_log_file = "temp-test-log.txt" + path_log_file = os.path.join(tmpdir.strpath, name_log_file) + temp_hdlr = logging.FileHandler(path_log_file, mode='w') + fmt = logging.Formatter(DEV_LOGGING_FMT) + temp_hdlr.setFormatter(fmt) + temp_hdlr.setLevel(logging.WARN) + models._LOGGER.handlers.append(temp_hdlr) + pi = ProtocolInterface(path_config_file) + pi.finalize_pipeline_key_and_paths(atac_pipe_name) + with open(path_log_file, 'r') as logfile: + loglines = logfile.readlines() + assert 1 == len(loglines) + logmsg = loglines[0] + assert "WARN" in logmsg and pipe_path in logmsg From a95591d11d534aca201640f8e56784b1e6dbdadd Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 25 Jun 2017 13:05:39 -0400 Subject: [PATCH 80/94] better separation of concerns in handling of creation of submission bundles and Sample subtyping --- looper/models.py | 45 ++++++++++++------- .../independent/test_ProtocolInterface.py | 13 +++--- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/looper/models.py b/looper/models.py index c371a257..71d0bd08 100644 --- a/looper/models.py +++ b/looper/models.py @@ -913,9 +913,26 @@ def build_submission_bundles(self, protocol, priority=True): format(len(new_scripts), protocol, proto_iface.source, new_scripts)) - new_jobs = [proto_iface.create_submission_bundle(pipeline_key, - protocol) - for pipeline_key in new_scripts] + # For each pipeline script to which this protocol will pertain, + # create the new jobs/submission bundles. + new_jobs = [] + for pipeline_key in new_scripts: + # Determine how to reference the pipeline and where it is. + strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ + proto_iface.finalize_pipeline_key_and_paths( + pipeline_key) + # Determine which interface and Sample subtype to use. 
+ sample_subtype = \ + proto_iface.fetch_sample_subtype( + protocol, strict_pipe_key, full_pipe_path) + # Package the pipeline's interface, subtype, command, and key. + submission_bundle = SubmissionBundle( + proto_iface.pipe_iface, sample_subtype, + strict_pipe_key, full_pipe_path_with_flags) + # Add this bundle to the collection of ones relevant for the + # current ProtocolInterface. + new_jobs.append(submission_bundle) + job_submission_bundles.append(new_jobs) # Repeat logic check of short-circuit conditional to account for @@ -2448,23 +2465,22 @@ def __repr__(self): return "ProtocolInterface from '{}'".format(self.source or "Mapping") - def create_submission_bundle(self, pipeline_key, protocol): + def fetch_sample_subtype( + self, protocol, strict_pipe_key, full_pipe_path): """ - Create the collection of values needed to submit Sample for processing. + Determine the interface and Sample subtype for a protocol and pipeline. - :param str pipeline_key: key for specific pipeline in a pipeline - interface mapping declaration :param str protocol: name of the relevant protocol - :return SubmissionBundle: a namedtuple with this ProtocolInterface's - PipelineInterface, the Sample subtype to use for the submission, - the pipeline (script) key, and the full pipeline path with - command-line flags + :param str strict_pipe_key: key for specific pipeline in a pipeline + interface mapping declaration + :param str full_pipe_path: (absolute, expanded) path to the + pipeline script + :return type: Sample subtype to use for jobs for the given protocol, + that use the pipeline indicated """ subtype = None - strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ - self.finalize_pipeline_key_and_paths(pipeline_key) this_pipeline_data = self.pipe_iface[strict_pipe_key] try: @@ -2507,8 +2523,7 @@ def create_submission_bundle(self, pipeline_key, protocol): subtype = subtype or \ _import_sample_subtype(full_pipe_path, subtype_name) _LOGGER.debug("Using Sample subtype: %s", subtype.__name__) - return SubmissionBundle(self.pipe_iface, subtype, - strict_pipe_key, full_pipe_path_with_flags) + return subtype def fetch(self, protocol): diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index ace63514..efa46229 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -15,11 +15,14 @@ def _write_config_data(protomap, conf_data, dirpath): """ - - :param protomap - :param conf_data: - :param dirpath: - :return: + Write ProtocolInterface data to (temp)file. 
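
For orientation, the data that _write_config_data serializes has two top-level sections, protocol_mapping and pipelines; a representative literal follows (values are illustrative). The sample_subtypes entry, which later tests manipulate, may be a single class name, a protocol-to-class mapping, or null to force the base Sample type.

    full_conf_data = {
        "protocol_mapping": {"ATAC": "ATACSeq.py"},
        "pipelines": {
            "ATACSeq.py": {
                "name": "ATACseq",
                "path": "pipelines/ATACSeq.py",
                "sample_subtypes": {"ATAC": "ATACseqSample"},
                "arguments": {"--sample-name": "sample_name"}
            }
        }
    }
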
+ + :param Mapping protomap: mapping from protocol name to pipeline key/name + :param Mapping conf_data: mapping from pipeline key/name to configuration + data for a PipelineInterface + :param str dirpath: path to filesystem location in which to place the + file to write + :return str: path to the (temp)file written """ full_conf_data = {"protocol_mapping": protomap, "pipelines": conf_data} filepath = os.path.join(dirpath, "pipeline_interface.yaml") From 923a4ab3ee3d4d8c1d636f3609a54ebb76dcf5f5 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Sun, 25 Jun 2017 22:59:08 -0400 Subject: [PATCH 81/94] catch more exceptions and add explanation; add tests and stub others, passing --- looper/models.py | 7 +- tests/models/independent/conftest.py | 23 +++- .../independent/test_ProtocolInterface.py | 104 +++++++++++++++++- 3 files changed, 124 insertions(+), 10 deletions(-) diff --git a/looper/models.py b/looper/models.py index 71d0bd08..28e3d8c0 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2477,6 +2477,8 @@ def fetch_sample_subtype( pipeline script :return type: Sample subtype to use for jobs for the given protocol, that use the pipeline indicated + :raises KeyError: if given a pipeline key that's not mapped in this + ProtocolInterface instance's PipelineInterface """ subtype = None @@ -2769,11 +2771,14 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): pipeline_module = import_from_source( name=modname, module_filepath=pipeline_filepath) except SystemExit: + # SystemExit would be caught as BaseException, but SystemExit is + # particularly suggestive of an a script without a conditional + # check on __main__, and as such warrant a tailored message. _LOGGER.warn("'%s' appears to attempt to run on import; " "does it lack a conditional on __main__? Using base %s", base_type.__name__) return base_type - except Exception as e: + except (BaseException, Exception) as e: _LOGGER.warn("Using base %s because of failure in attempt to " "import pipeline module: %s", base_type.__name__, e) return base_type diff --git a/tests/models/independent/conftest.py b/tests/models/independent/conftest.py index bd39d38a..b32c1609 100644 --- a/tests/models/independent/conftest.py +++ b/tests/models/independent/conftest.py @@ -84,22 +84,33 @@ def atac_pipe_name(): @pytest.fixture(scope="function") -def atacseq_piface_data(atacseq_iface_without_resources, - resources, atac_pipe_name): +def atacseq_iface_with_resources( + atacseq_iface_without_resources, resources): """ - Provide a test case with data for an ATACSeq PipelineInterface. :param dict atacseq_iface_without_resources: PipelineInterface config data, minus a resources section :param Mapping resources: resources section of PipelineInterface configuration data + :return Mapping: pipeline interface data for ATAC-Seq pipeline, with all + of the base sections plus resources section + """ + iface_data = copy.deepcopy(atacseq_iface_without_resources) + iface_data["resources"] = copy.deepcopy(resources) + return iface_data + + + +@pytest.fixture(scope="function") +def atacseq_piface_data(atacseq_iface_with_resources, atac_pipe_name): + """ + Provide a test case with data for an ATACSeq PipelineInterface. 
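
A note on the broadened exception handling above: Exception already derives from BaseException, so the two-element tuple behaves the same as catching BaseException alone; the ordering matters only in that SystemExit is intercepted first to give its tailored message. The intent, sketched standalone (not the patch's code):

    def import_or_fallback(do_import, fallback_type):
        try:
            return do_import()
        except SystemExit:
            # Suggests the pipeline script runs code at import time and lacks
            # an "if __name__ == '__main__':" guard; fall back gracefully.
            return fallback_type
        except BaseException as exc:
            print("Pipeline import failed ({}); using {}".format(
                exc, fallback_type.__name__))
            return fallback_type

    def _bad_import():
        raise ImportError("missing dependency")

    class BaseSample(object):
        pass

    assert BaseSample is import_or_fallback(_bad_import, BaseSample)
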
+ :param str atac_pipe_name: name/key for the pipeline to which the interface data pertains :return dict: configuration data needed to create PipelineInterface """ - piface = copy.deepcopy(atacseq_iface_without_resources) - piface.update(resources) - return {atac_pipe_name: piface} + return {atac_pipe_name: copy.deepcopy(atacseq_iface_with_resources)} diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index efa46229..090ff7cc 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -1,9 +1,13 @@ """ Tests for ProtocolInterface, for Project/PipelineInterface interaction. """ +import __builtin__ +import inspect import logging +import mock import os import pytest import yaml +import looper from looper import models, DEV_LOGGING_FMT from looper.models import ProtocolInterface @@ -167,7 +171,101 @@ def test_warns_about_nonexistent_pipeline_script_path( -@pytest.mark.skip("Not implemented") -class ProtocolInterfacePipelineSampleSubtypeTests: +class SampleSubtypeTests: """ ProtocolInterface attempts import of pipeline-specific Sample. """ - pass + + # Basic cases + # 1 -- unmapped pipeline + # 2 -- subtypes section is single string + # 3 -- subtypes section is mapping () + # 4 -- subtypes section is missing (use single Sample subclass if there is one, base Sample for 0 or > 1 Sample subtypes defined) + # 5 -- subtypes section is null --> ALWAYS USE BASE SAMPLE (backdoor user side mechanism for making this be so) + + # Import trouble cases + # No __main__ + # Argument parsing + # missing import(s) + + # Subcases + # 2 -- single string + # 2a -- named class isn't defined in the module + # 2b -- named class is in module but isn't defined + # + + @pytest.fixture(scope="function") + def subtypes_section_single(self, atac_pipe_name): + pass + + + @pytest.fixture(scope="function") + def subtypes_section_multiple(self, atac_pipe_name): + pass + + + @pytest.mark.parametrize( + argnames="pipe_key", + argvalues=["ATAC-Seq.py", "atacseq.py", "ATACSEQ.py", "ATACSEQ", + "atacseq", "ATAC-seq.py", "ATACseq.py"]) + @pytest.mark.parametrize( + argnames="protocol", + argvalues=["ATAC-Seq", "ATACSeq", "ATACseq", "ATAC-seq", "ATAC", + "ATACSEQ", "ATAC-SEQ", "atac", "atacseq", "atac-seq"]) + def test_pipeline_key_close_matches_dont_count( + self, tmpdir, pipe_key, protocol, atac_pipe_name, + atacseq_iface_with_resources): + """ Request for Sample subtype for unmapped pipeline is KeyError. """ + strict_pipe_key = atac_pipe_name + protocol_mapping = {protocol: strict_pipe_key} + path_config_file = _write_config_data( + protomap=protocol_mapping, + conf_data={strict_pipe_key: atacseq_iface_with_resources}, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(path_config_file) + full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + with pytest.raises(KeyError): + # Mismatch between pipeline key arg and strict key --> KeyError. + piface.fetch_sample_subtype( + protocol, pipe_key, full_pipe_path=full_pipe_path) + + + def test_protocol_match_is_fuzzy(self): + """ Punctuation and case mismatches are tolerated in protocol name. 
""" + pass + + + @pytest.mark.parametrize( + argnames="error_type", + argvalues=zip(*inspect.getmembers( + __builtin__, lambda o: inspect.isclass(o) and + issubclass(o, BaseException)))[1]) + def test_problematic_import_builtin_exception(self, error_type): + pass + + + @pytest.mark.parametrize( + argnames="error_type", + argvalues=zip(*inspect.getmembers( + looper.models, lambda o: inspect.isclass(o) and + issubclass(o, Exception)))[1]) + def test_problematic_import_custom_exception(self, error_type): + pass + + + def test_no_subtypes_section(self): + pass + + + def test_subtypes_section_maps_protocol_to_non_sample_subtype(self): + pass + + + def test_subtypes_section_single_subtype_name_is_sample_subtype(self): + pass + + + def test_subtypes_section_single_subtype_name_is_not_sample_subtype(self): + pass + + + def test_subtypes_section_mapping_missing_protocol(self): + pass From 6b9dcfb62b801d1d9dd9c94117c5f3004b14cf0f Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 10:23:21 -0400 Subject: [PATCH 82/94] account for possibility of null return value being introduced into the protected import function by guarding with a final default Sample fallback; add tests for protocol matching --- looper/models.py | 59 ++++++++---- .../independent/test_ProtocolInterface.py | 95 +++++++++++++++---- 2 files changed, 121 insertions(+), 33 deletions(-) diff --git a/looper/models.py b/looper/models.py index 28e3d8c0..ba37a734 100644 --- a/looper/models.py +++ b/looper/models.py @@ -865,7 +865,7 @@ def build_submission_bundles(self, protocol, priority=True): if priority and len(job_submission_bundles) > 0: return job_submission_bundles[0] - this_protocol_pipelines = proto_iface.fetch(protocol) + this_protocol_pipelines = proto_iface.fetch_pipelines(protocol) if not this_protocol_pipelines: _LOGGER.warn("No mapping for protocol '%s' in %s", protocol, proto_iface) @@ -2465,6 +2465,18 @@ def __repr__(self): return "ProtocolInterface from '{}'".format(self.source or "Mapping") + def fetch_pipelines(self, protocol): + """ + Fetch the mapping for a particular protocol, null if unmapped. + + :param str protocol: name/key for the protocol for which to fetch the + pipeline(s) + :return str | Iterable[str] | NoneType: pipeline(s) to which the given + protocol is mapped, otherwise null + """ + return self.protomap.mappings.get(alpha_cased(protocol)) + + def fetch_sample_subtype( self, protocol, strict_pipe_key, full_pipe_path): """ @@ -2499,21 +2511,26 @@ def fetch_sample_subtype( subtype_name = None else: if subtypes is None: - _LOGGER.debug("Null Sample subtypes specified for pipeline: " - "'%s'; using base Sample type", strict_pipe_key) - # Designate lack of need to attempt pipeline module import. + # Designate lack of need for import attempt and provide + # class with name to format message below. subtype = Sample + _LOGGER.debug("Null %s subtype(s) section specified for " + "pipeline: '%s'; using base %s type", + subtype.__name__, strict_pipe_key, + subtype.__name__) elif isinstance(subtypes, str): subtype_name = subtypes _LOGGER.debug("Single subtype name for pipeline '%s' " "in interface from '%s': '%s'", subtype_name, strict_pipe_key, self.source) else: + temp_subtypes = { + alpha_cased(p): st for p, st in subtypes.items()} try: - temp_subtypes = {alpha_cased(p): st - for p, st in subtypes.items()} subtype_name = temp_subtypes[alpha_cased(protocol)] except KeyError: + # Designate lack of need for import attempt and provide + # class with name to format message below. 
subtype = Sample _LOGGER.debug("No %s subtype specified in interface from " "'%s': '%s', '%s'; known: %s", @@ -2523,24 +2540,32 @@ def fetch_sample_subtype( # subtype_name is defined if and only if subtype remained null. subtype = subtype or \ - _import_sample_subtype(full_pipe_path, subtype_name) + _import_sample_subtype(full_pipe_path, subtype_name) or \ + Sample _LOGGER.debug("Using Sample subtype: %s", subtype.__name__) return subtype - def fetch(self, protocol): - """ - Fetch the mapping for a particular protocol, null if unmapped. - - :param str protocol: - :return str | Iterable[str] | NoneType: pipeline(s) to which the given - protocol is mapped, otherwise null + @classmethod + def _parse_iface_data(cls, pipe_iface_data): """ - return self.protomap.mappings.get(alpha_cased(protocol)) + Parse data from mappings to set instance attributes. + The data that define a ProtocolInterface are a "protocol_mapping" + Mapping and a "pipelines" Mapping, which are used to create a + ProtocolMapper and a PipelineInterface, representing the configuration + data for pipeline(s) from a single location. There are a couple of + different ways (file, folder, and eventually, raw Mapping) to provide + this data, and this function provides some standardization to how + those data are processed, independent of input type/format. - @classmethod - def _parse_iface_data(cls, pipe_iface_data): + :param Mapping[str, Mapping] pipe_iface_data: mapping from section + name to section data mapping; more specifically, the protocol + mappings Mapping and the PipelineInterface mapping + :return list[(str, ProtocolMapper | PipelineInterface)]: pairs of + attribute name for the ProtocolInterface being created, and the + value for that attribute, + """ assignments = [("protocol_mapping", ProtocolMapper, "protomap"), ("pipelines", PipelineInterface, "pipe_iface")] attribute_values = [] diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 090ff7cc..fc9af58e 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -2,6 +2,7 @@ import __builtin__ import inspect +import itertools import logging import mock import os @@ -9,7 +10,7 @@ import yaml import looper from looper import models, DEV_LOGGING_FMT -from looper.models import ProtocolInterface +from looper.models import ProtocolInterface, Sample __author__ = "Vince Reuter" @@ -192,6 +193,12 @@ class SampleSubtypeTests: # 2b -- named class is in module but isn't defined # + PROTOCOL_NAME_VARIANTS = [ + "ATAC-Seq", "ATACSeq", "ATACseq", "ATAC-seq", "ATAC", + "ATACSEQ", "ATAC-SEQ", "atac", "atacseq", "atac-seq"] + + + @pytest.fixture(scope="function") def subtypes_section_single(self, atac_pipe_name): pass @@ -204,33 +211,86 @@ def subtypes_section_multiple(self, atac_pipe_name): @pytest.mark.parametrize( argnames="pipe_key", - argvalues=["ATAC-Seq.py", "atacseq.py", "ATACSEQ.py", "ATACSEQ", - "atacseq", "ATAC-seq.py", "ATACseq.py"]) + argvalues=["{}.py".format(proto) for proto + in PROTOCOL_NAME_VARIANTS]) @pytest.mark.parametrize( argnames="protocol", - argvalues=["ATAC-Seq", "ATACSeq", "ATACseq", "ATAC-seq", "ATAC", - "ATACSEQ", "ATAC-SEQ", "atac", "atacseq", "atac-seq"]) - def test_pipeline_key_close_matches_dont_count( + argvalues=PROTOCOL_NAME_VARIANTS) + def test_pipeline_key_match_is_strict( self, tmpdir, pipe_key, protocol, atac_pipe_name, atacseq_iface_with_resources): """ Request for Sample subtype for unmapped pipeline is KeyError. 
""" + + # Create the ProtocolInterface. strict_pipe_key = atac_pipe_name protocol_mapping = {protocol: strict_pipe_key} - path_config_file = _write_config_data( - protomap=protocol_mapping, - conf_data={strict_pipe_key: atacseq_iface_with_resources}, - dirpath=tmpdir.strpath) - piface = ProtocolInterface(path_config_file) + confpath = _write_config_data( + protomap=protocol_mapping, dirpath=tmpdir.strpath, + conf_data={strict_pipe_key: atacseq_iface_with_resources}) + piface = ProtocolInterface(confpath) + + # The absolute pipeline path is the pipeline name, joined to the + # ProtocolInterface's pipelines location. This location is the + # location from which a Sample subtype import is attempted. full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) - with pytest.raises(KeyError): - # Mismatch between pipeline key arg and strict key --> KeyError. + + # TODO: update to pytest.raises(None) if/when 3.1 adoption. + # Match between pipeline key specified and the strict key used in + # the mapping --> no error while mismatch --> error. + if pipe_key == atac_pipe_name: piface.fetch_sample_subtype( - protocol, pipe_key, full_pipe_path=full_pipe_path) + protocol, pipe_key, full_pipe_path=full_pipe_path) + else: + with pytest.raises(KeyError): + piface.fetch_sample_subtype( + protocol, pipe_key, full_pipe_path=full_pipe_path) - def test_protocol_match_is_fuzzy(self): + @pytest.mark.parametrize( + argnames=["mapped_protocol", "requested_protocol"], + argvalues=itertools.combinations(PROTOCOL_NAME_VARIANTS, 2)) + def test_protocol_match_is_fuzzy( + self, tmpdir, mapped_protocol, atac_pipe_name, + requested_protocol, atacseq_piface_data): """ Punctuation and case mismatches are tolerated in protocol name. """ - pass + + # Needed to create the ProtocolInterface. + protomap = {mapped_protocol: atac_pipe_name} + # Needed to invoke the function under test. + full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # PipelineInterface data provided maps name to actual interface data + # Mapping, so modify the ATAC-Seq mapping within that. + # In this test, we're interested in the resolution of the protocol + # name, that with it we can grab the name of a class. Thus, we + # need only an arbitrary class name about which we can make the + # relevant assertion(s). + test_class_name = "TotallyArbitrary" + atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = \ + test_class_name + + # Write out configuration data and create the ProtocolInterface. + conf_path = _write_config_data( + protomap=protomap, conf_data=atacseq_piface_data, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + + # Make the call under test, patching the function protected + # function that's called iff the protocol name match succeeds. + with mock.patch("looper.models._import_sample_subtype", + return_value=None) as mocked_import: + # Return value is irrelevant; the effect of the protocol name + # match/resolution is entirely observable via the argument to the + # protected import function. + piface.fetch_sample_subtype( + protocol=requested_protocol, + strict_pipe_key=atac_pipe_name, + full_pipe_path=full_pipe_path) + # When the protocol name match/resolution succeeds, the name of the + # Sample subtype class to which it was mapped is passed as an + # argument to the protected import function. 
+ mocked_import.assert_called_with(full_pipe_path, test_class_name) + @pytest.mark.parametrize( @@ -239,6 +299,7 @@ def test_protocol_match_is_fuzzy(self): __builtin__, lambda o: inspect.isclass(o) and issubclass(o, BaseException)))[1]) def test_problematic_import_builtin_exception(self, error_type): + """ Base Sample is used if builtin exception on pipeline import. """ pass @@ -248,10 +309,12 @@ def test_problematic_import_builtin_exception(self, error_type): looper.models, lambda o: inspect.isclass(o) and issubclass(o, Exception)))[1]) def test_problematic_import_custom_exception(self, error_type): + """ Base Sample is used if custom exception on pipeline import. """ pass def test_no_subtypes_section(self): + """ """ pass From 47068920c0216351b0474016cc9d27ba689d3d90 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 13:27:40 -0400 Subject: [PATCH 83/94] ensure that sys.modules is cleaned up; make sure that we account for a pipeline's import of base Sample; more ProtocolInterface tests, passing --- looper/models.py | 25 ++- looper/utils.py | 11 + .../independent/test_ProtocolInterface.py | 204 +++++++++++++++--- 3 files changed, 200 insertions(+), 40 deletions(-) diff --git a/looper/models.py b/looper/models.py index ba37a734..5eba2dfd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2484,7 +2484,8 @@ def fetch_sample_subtype( :param str protocol: name of the relevant protocol :param str strict_pipe_key: key for specific pipeline in a pipeline - interface mapping declaration + interface mapping declaration; this must exactly match a key in + the PipelineInterface (or the Mapping that represent it) :param str full_pipe_path: (absolute, expanded) path to the pipeline script :return type: Sample subtype to use for jobs for the given protocol, @@ -2821,30 +2822,34 @@ def class_names(cs): pipeline_module, lambda obj: inspect.isclass(obj)) classes = [klazz for _, klazz in classes] _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) - sample_subtypes = filter(lambda c: issubclass(c, base_type), classes) - _LOGGER.debug("%d %s subtype(s): %s", len(sample_subtypes), - base_type.__name__, class_names(sample_subtypes)) + + # Base Sample could be imported; we want the true subtypes. + proper_subtypes = filter( + lambda c: issubclass(c, base_type) and c != base_type, + classes) + _LOGGER.debug("%d %s subtype(s): %s", len(proper_subtypes), + base_type.__name__, class_names(proper_subtypes)) # Determine course of action based on subtype request and number found. if not subtype_name: _LOGGER.debug("No specific subtype is requested from '%s'", pipeline_filepath) - if len(sample_subtypes) == 1: + if len(proper_subtypes) == 1: # No specific request and single subtype --> use single subtype. - subtype = sample_subtypes[0] + subtype = proper_subtypes[0] _LOGGER.debug("Single %s subtype found in '%s': '%s'", base_type.__name__, pipeline_filepath, subtype.__name__) return subtype else: # We can't arbitrarily select from among 0 or multiple subtypes. - _LOGGER.debug("%s subtype cannot be selected from %d in '%s'; " - "using base type", base_type.__name__, - len(sample_subtypes), pipeline_filepath) + _LOGGER.debug("%s subtype cannot be selected from %d found in " + "'%s'; using base type", base_type.__name__, + len(proper_subtypes), pipeline_filepath) return base_type else: # Specific subtype request --> look for match. 
- for st in sample_subtypes: + for st in proper_subtypes: if st.__name__ == subtype_name: _LOGGER.debug("Successfully imported %s from '%s'", subtype_name, pipeline_filepath) diff --git a/looper/utils.py b/looper/utils.py index dc17b6a0..fcd9bcc0 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -145,6 +145,16 @@ def import_from_source(name, module_filepath): raise ValueError("Path to alleged module file doesn't point to an " "extant file: '{}'".format(module_filepath)) + # We just want the module object, not the effect of altering modules. + try: + module_to_restore = sys.modules[name] + except KeyError: + def cleanup(): + del sys.modules[name] + else: + def cleanup(): + sys.modules[name] = module_to_restore + if sys.version_info >= (3, 5): from importlib import util as _il_util modspec = _il_util.spec_from_file_location( @@ -160,6 +170,7 @@ def import_from_source(name, module_filepath): loader = _il_mach.SourceFileLoader(name, module_filepath) mod = loader.load_module() + cleanup() return mod diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index fc9af58e..abdc4c77 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -4,11 +4,12 @@ import inspect import itertools import logging -import mock import os + +import mock import pytest import yaml -import looper + from looper import models, DEV_LOGGING_FMT from looper.models import ProtocolInterface, Sample @@ -17,6 +18,29 @@ __email__ = "vreuter@virginia.edu" +ATAC_PROTOCOL_NAME = "atac" + + +class CustomExceptionA(Exception): + def __init__(self, *args): + super(CustomExceptionA, self).__init__(*args) + +class CustomExceptionB(Exception): + def __init__(self, *args): + super(CustomExceptionB, self).__init__(*args) + +CUSTOM_EXCEPTIONS = [CustomExceptionA, CustomExceptionB] + + +# Test case parameterization, but here for import locality and +# to reduce clutter in the pararmeterization declaration. +BUILTIN_EXCEPTIONS_WITHOUT_REQUIRED_ARGUMENTS = \ + list(zip(*inspect.getmembers( + __builtin__, lambda o: inspect.isclass(o) and + issubclass(o, BaseException) and + not issubclass(o, UnicodeError)))[1]) + + def _write_config_data(protomap, conf_data, dirpath): """ @@ -66,7 +90,7 @@ def path_config_file(request, tmpdir, atac_pipe_name): # interface data bundle. 
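
    # For reference, the interface file written out at the end of this
    # fixture by _write_config_data has roughly this shape (names come from
    # the fixtures; other per-pipeline keys are elided):
    #
    #     {"protocol_mapping": {"ATAC": atac_pipe_name},
    #      "pipelines": {atac_pipe_name: {..., "path": pipe_path}}}
    #
    # that is, one section mapping protocol name to pipeline key, and one
    # section mapping pipeline key to that pipeline's PipelineInterface data.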
for iface_bundle in conf_data.values(): iface_bundle["path"] = pipe_path - return _write_config_data(protomap={"ATAC": atac_pipe_name}, + return _write_config_data(protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, conf_data=conf_data, dirpath=tmpdir.strpath) @@ -198,17 +222,6 @@ class SampleSubtypeTests: "ATACSEQ", "ATAC-SEQ", "atac", "atacseq", "atac-seq"] - - @pytest.fixture(scope="function") - def subtypes_section_single(self, atac_pipe_name): - pass - - - @pytest.fixture(scope="function") - def subtypes_section_multiple(self, atac_pipe_name): - pass - - @pytest.mark.parametrize( argnames="pipe_key", argvalues=["{}.py".format(proto) for proto @@ -295,30 +308,100 @@ def test_protocol_match_is_fuzzy( @pytest.mark.parametrize( argnames="error_type", - argvalues=zip(*inspect.getmembers( - __builtin__, lambda o: inspect.isclass(o) and - issubclass(o, BaseException)))[1]) - def test_problematic_import_builtin_exception(self, error_type): + argvalues=CUSTOM_EXCEPTIONS + + BUILTIN_EXCEPTIONS_WITHOUT_REQUIRED_ARGUMENTS) + def test_problematic_import_builtin_exception( + self, tmpdir, error_type, atac_pipe_name, atacseq_piface_data): """ Base Sample is used if builtin exception on pipeline import. """ - pass + + # Values needed for object creation and function invocation + protocol = "ATAC" + protocol_mapping = {protocol: atac_pipe_name} + full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # Modify the data for the ProtocolInterface and create it. + atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = \ + {protocol: "IrrelevantClassname"} + conf_path = _write_config_data( + protomap=protocol_mapping, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + + # We want to test the effect of an encounter with an exception during + # the import attempt, so patch the relevant function with a function + # to raise the parameterized exception type. + with mock.patch( + "looper.utils.import_from_source", + side_effect=error_type()): + subtype = piface.fetch_sample_subtype( + protocol=protocol, strict_pipe_key=atac_pipe_name, + full_pipe_path=full_pipe_path) + # When the import hits an exception, the base Sample type is used. + assert subtype is Sample @pytest.mark.parametrize( - argnames="error_type", - argvalues=zip(*inspect.getmembers( - looper.models, lambda o: inspect.isclass(o) and - issubclass(o, Exception)))[1]) - def test_problematic_import_custom_exception(self, error_type): - """ Base Sample is used if custom exception on pipeline import. 
""" + argnames="num_sample_subclasses", argvalues=[0, 1, 2], + ids=lambda n_samples: + " num_sample_subclasses = {} ".format(n_samples)) + @pytest.mark.parametrize( + argnames="decoy_class", argvalues=[False, True], + ids=lambda decoy: " decoy_class = {} ".format(decoy)) + def test_no_subtypes_section( + self, tmpdir, path_config_file, atac_pipe_name, + num_sample_subclasses, decoy_class): + """ DEPENDS ON PIPELINE MODULE CONTENT """ + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + piface = ProtocolInterface(path_config_file) + sample_subclass_basename = "SampleSubclass" + sample_lines = [ + "class {basename}{index}(Sample):", + "\tdef __init__(*args, **kwargs):", + "\t\tsuper({basename}{index}, self).__init__(*args, **kwargs)"] + non_sample_class_lines = [ + "class NonSample(object):", "\tdef __init__(self):", + "\t\tsuper(NonSample, self).__init__()"] + def populate_sample_lines(n_classes): + return [[sample_lines[0].format(basename=sample_subclass_basename, + index=class_index), + sample_lines[1], + sample_lines[2].format(basename=sample_subclass_basename, + index=class_index)] + for class_index in range(n_classes)] + class_lines_pool = populate_sample_lines(num_sample_subclasses) + if decoy_class: + class_lines_pool.append(non_sample_class_lines) + for lines_order in itertools.permutations(class_lines_pool): + path_module_file = _create_module( + lines_by_class=lines_order, filepath=pipe_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + if num_sample_subclasses == 1: + exp_subtype_name = "{}0".format(sample_subclass_basename) + else: + exp_subtype_name = Sample.__name__ + # DEBUG + try: + assert exp_subtype_name == subtype.__name__ + except AssertionError: + with open(pipe_path, 'r') as f: + print("LINES: {}".format("\n".join(f.readlines()))) + raise + + + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["singleton", "mapping"]) + def test_Sample_as_name(self, tmpdir, spec_type): + """ A pipeline may redeclare Sample as a subtype name. """ pass - def test_no_subtypes_section(self): - """ """ + def test_subtypes_single_name_non_implemented(self): pass - def test_subtypes_section_maps_protocol_to_non_sample_subtype(self): + def test_subtypes_section_single_subtype_name_is_not_sample_subtype(self): pass @@ -326,9 +409,70 @@ def test_subtypes_section_single_subtype_name_is_sample_subtype(self): pass - def test_subtypes_section_single_subtype_name_is_not_sample_subtype(self): + def test_subtypes_mapping_to_non_implemented_class(self): + pass + + + def test_has_subtypes_mapping_but_protocol_doesnt_match(self): + pass + + + def test_subtypes_mapping_to_non_sample_subtype(self): + pass + + + def test_sample_grandchild(self): + """ The subtype to be used can be a grandchild of Sample. 
""" + pass + + + @pytest.fixture(scope="function") + def subtypes_section_single(self, atac_pipe_name): pass - def test_subtypes_section_mapping_missing_protocol(self): + @pytest.fixture(scope="function") + def subtypes_section_multiple(self, atac_pipe_name): pass + + + @pytest.fixture(scope="function") + def create_module(self, request, tmpdir): + num_sample_subclasses = \ + request.getfixturevalue("num_sample_subclasses") + num_non_sample_subclasses = \ + request.getfixturevalue("num_non_sample_subclasses") + path_module_file = os.path.join(tmpdir.strpath, "_dummy_classes.py") + with open(path_module_file, 'w') as modfile: + pass + + +def _create_module(lines_by_class, filepath): + """ + Write out lines that will defined a module. + + :param Sequence[str] lines_by_class: lines that define a class + :param str filepath: path to module file to create + :return str: path to the module file written + """ + header = "from looper.models import Sample" + lines = "\n\n".join( + [header] + ["\n".join(class_lines) + for class_lines in lines_by_class]) + with open(filepath, 'w') as modfile: + modfile.write("{}\n".format(lines)) + return filepath + + + +def _generate_lines_by_class(sample_subclass_vector): + """ + Generate lines of text that define dummy classes. + + The input is a vector of flags indicating the order in which the classes + should be defined, and whether each should derive from Sample. + + :param sample_subclass_vector: + :return: + """ + pass From be7c8a1664dd8f09057a21d10f79a61a8ce228d7 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 15:31:46 -0400 Subject: [PATCH 84/94] python 3, less verbosity, better messaging, fix format errors, random module naming not cleanup since that causes problems in the subtype constructor --- looper/looper.py | 1 - looper/models.py | 41 +++++++++++-------- looper/utils.py | 33 +++++++++------ .../independent/test_ProtocolInterface.py | 21 ++++++---- 4 files changed, 57 insertions(+), 39 deletions(-) diff --git a/looper/looper.py b/looper/looper.py index a2f296fd..604261bc 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -821,7 +821,6 @@ def main(): # Parse command-line arguments and establish logger. args, remaining_args = parse_arguments() - _LOGGER.info("Command: {} (Looper version: {})". format(args.command, __version__)) # Initialize project diff --git a/looper/models.py b/looper/models.py index 5eba2dfd..1b06ecbe 100644 --- a/looper/models.py +++ b/looper/models.py @@ -67,7 +67,8 @@ from .utils import \ alpha_cased, check_bam, check_fastq, expandpath, \ - get_file_size, import_from_source, parse_ftype, partition + get_file_size, import_from_source, parse_ftype, partition, \ + standard_stream_redirector # TODO: decide if we want to denote functions for export. 
@@ -172,7 +173,7 @@ def merge_sample(sample, merge_table, data_sources, derived_columns): merged_cols = {} if merge_table is None: - _LOGGER.debug("No data for sample merge, skipping") + _LOGGER.log(5, "No data for sample merge, skipping") return merged_cols if SAMPLE_NAME_COLNAME not in merge_table.columns: @@ -1105,10 +1106,10 @@ def _make_basic_samples(self): try: sample.data_path = sample.data_source except AttributeError: - _LOGGER.debug("Sample '%s' lacks data source; skipping " + _LOGGER.log(5, "Sample '%s' lacks data source; skipping " "data path assignment", sample.sample_name) else: - _LOGGER.debug("Path to sample data: '%s'", sample.data_source) + _LOGGER.log(5, "Path to sample data: '%s'", sample.data_source) samples.append(sample) return samples @@ -2788,26 +2789,34 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): """ base_type = Sample - _, modname = _os.path.split(pipeline_filepath) - modname, _ = _os.path.splitext(modname) - try: - _LOGGER.debug("Attempting to import module defined by {}, " - "calling it {}".format(pipeline_filepath, modname)) - pipeline_module = import_from_source( - name=modname, module_filepath=pipeline_filepath) + _LOGGER.debug("Attempting to import module defined by {}". + format(pipeline_filepath)) + # TODO: consider more fine-grained control here. What if verbose + # TODO: logging is only to file, not to stdout/err? + if _LOGGER.getEffectiveLevel() > logging.DEBUG: + with open(_os.devnull, 'w') as temp_standard_streams: + with standard_stream_redirector(temp_standard_streams): + pipeline_module = import_from_source(pipeline_filepath) + else: + pipeline_module = import_from_source(pipeline_filepath) + except SystemExit: # SystemExit would be caught as BaseException, but SystemExit is # particularly suggestive of an a script without a conditional # check on __main__, and as such warrant a tailored message. _LOGGER.warn("'%s' appears to attempt to run on import; " - "does it lack a conditional on __main__? Using base %s", - base_type.__name__) + "does it lack a conditional on '__main__'? " + "Using base type: %s", + pipeline_filepath, base_type.__name__) return base_type + except (BaseException, Exception) as e: _LOGGER.warn("Using base %s because of failure in attempt to " - "import pipeline module: %s", base_type.__name__, e) + "import pipeline module '%s': %r", + base_type.__name__, pipeline_filepath, e) return base_type + else: _LOGGER.debug("Successfully imported pipeline module '%s', " "naming it '%s'", pipeline_filepath, @@ -2824,9 +2833,9 @@ def class_names(cs): _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) # Base Sample could be imported; we want the true subtypes. - proper_subtypes = filter( + proper_subtypes = list(filter( lambda c: issubclass(c, base_type) and c != base_type, - classes) + classes)) _LOGGER.debug("%d %s subtype(s): %s", len(proper_subtypes), base_type.__name__, class_names(proper_subtypes)) diff --git a/looper/utils.py b/looper/utils.py index fcd9bcc0..bc978bbf 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -2,8 +2,11 @@ from argparse import ArgumentParser from collections import Counter, defaultdict, Iterable +import contextlib import logging import os +import random +import string import subprocess as sp import yaml from ._version import __version__ @@ -128,16 +131,14 @@ def get_file_size(filename): -def import_from_source(name, module_filepath): +def import_from_source(module_filepath): """ Import a module from a particular filesystem location. 
- :param str name: name for the module when loaded :param str module_filepath: path to the file that constitutes the module to import :return module: module imported from the given location, named as indicated :raises ValueError: if path provided does not point to an extant file - :raises ImportError: if path provided is indeed an existing file, but the """ import sys @@ -145,16 +146,11 @@ def import_from_source(name, module_filepath): raise ValueError("Path to alleged module file doesn't point to an " "extant file: '{}'".format(module_filepath)) - # We just want the module object, not the effect of altering modules. - try: - module_to_restore = sys.modules[name] - except KeyError: - def cleanup(): - del sys.modules[name] - else: - def cleanup(): - sys.modules[name] = module_to_restore + # Randomly generate module name. + fname_chars = string.ascii_letters + string.digits + name = "".join(random.choice(fname_chars) for _ in range(20)) + # Import logic is version-dependent. if sys.version_info >= (3, 5): from importlib import util as _il_util modspec = _il_util.spec_from_file_location( @@ -170,7 +166,6 @@ def cleanup(): loader = _il_mach.SourceFileLoader(name, module_filepath) mod = loader.load_module() - cleanup() return mod @@ -253,6 +248,18 @@ def partition(items, test): +@contextlib.contextmanager +def standard_stream_redirector(stream): + import sys + genuine_stdout, genuine_stderr = sys.stdout, sys.stderr + sys.stdout, sys.stderr = stream, stream + try: + yield + finally: + sys.stdout, sys.stderr = genuine_stdout, genuine_stderr + + + class CommandChecker(object): """ Validate PATH availability of executables referenced by a config file. diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index abdc4c77..e54a2739 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -1,10 +1,14 @@ """ Tests for ProtocolInterface, for Project/PipelineInterface interaction. """ -import __builtin__ import inspect import itertools import logging import os +import sys +if sys.version_info < (3, ): + import __builtin__ as builtins +else: + import builtins import mock import pytest @@ -34,11 +38,11 @@ def __init__(self, *args): # Test case parameterization, but here for import locality and # to reduce clutter in the pararmeterization declaration. 
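
# (Per its name, the intent of the expression below is to gather the builtins
# exception classes that can be raised without constructor arguments, e.g.
# ValueError or KeyError, while excluding the UnicodeError family, whose
# members such as UnicodeDecodeError cannot be constructed bare.)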
-BUILTIN_EXCEPTIONS_WITHOUT_REQUIRED_ARGUMENTS = \ - list(zip(*inspect.getmembers( - __builtin__, lambda o: inspect.isclass(o) and +_, BUILTIN_EXCEPTIONS_WITHOUT_REQUIRED_ARGUMENTS = \ + list(map(list, zip(*inspect.getmembers( + builtins, lambda o: inspect.isclass(o) and issubclass(o, BaseException) and - not issubclass(o, UnicodeError)))[1]) + not issubclass(o, UnicodeError))))) @@ -372,8 +376,7 @@ def populate_sample_lines(n_classes): if decoy_class: class_lines_pool.append(non_sample_class_lines) for lines_order in itertools.permutations(class_lines_pool): - path_module_file = _create_module( - lines_by_class=lines_order, filepath=pipe_path) + _create_module(lines_by_class=lines_order, filepath=pipe_path) subtype = piface.fetch_sample_subtype( protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) @@ -381,12 +384,12 @@ def populate_sample_lines(n_classes): exp_subtype_name = "{}0".format(sample_subclass_basename) else: exp_subtype_name = Sample.__name__ - # DEBUG try: assert exp_subtype_name == subtype.__name__ except AssertionError: with open(pipe_path, 'r') as f: - print("LINES: {}".format("\n".join(f.readlines()))) + print("PIPELINE MODULE LINES: {}". + format("".join(f.readlines()))) raise From 9977f14b84cd3987f79d6487bcb1f45d97ffe7a6 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 15:48:20 -0400 Subject: [PATCH 85/94] descriptions, cleanup, better organization --- looper/models.py | 4 + looper/utils.py | 9 ++ tests/conftest.py | 1 - .../models/independent/test_AttributeDict.py | 2 - .../independent/test_ProtocolInterface.py | 50 +++++----- tests/models/independent/test_Sample.py | 94 ------------------- tests/models/test_models_smoke.py | 2 +- tests/test_looper.py | 1 - 8 files changed, 35 insertions(+), 128 deletions(-) diff --git a/looper/models.py b/looper/models.py index 1b06ecbe..2f5b1b2b 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2792,8 +2792,12 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): try: _LOGGER.debug("Attempting to import module defined by {}". format(pipeline_filepath)) + # TODO: consider more fine-grained control here. What if verbose # TODO: logging is only to file, not to stdout/err? + + # Redirect standard streams during the import to prevent noisy + # error messaging in the shell that may distract or confuse a user. if _LOGGER.getEffectiveLevel() > logging.DEBUG: with open(_os.devnull, 'w') as temp_standard_streams: with standard_stream_redirector(temp_standard_streams): diff --git a/looper/utils.py b/looper/utils.py index bc978bbf..6f74ef5f 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -250,6 +250,15 @@ def partition(items, test): @contextlib.contextmanager def standard_stream_redirector(stream): + """ + Temporarily redirect stdout and stderr to another stream. + + This can be useful for capturing messages for easier inspection, or + for rerouting and essentially ignoring them, with the destination as + something like an opened os.devnull. + + :param FileIO[str] stream: temporary proxy for standard streams + """ import sys genuine_stdout, genuine_stderr = sys.stdout, sys.stderr sys.stdout, sys.stderr = stream, stream diff --git a/tests/conftest.py b/tests/conftest.py index cde80f02..46c73631 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,6 @@ from looper.loodels import Project -# TODO: needed for interactive mode, but may crush cmdl option for setup. 
_LOGGER = logging.getLogger("looper") diff --git a/tests/models/independent/test_AttributeDict.py b/tests/models/independent/test_AttributeDict.py index f1f7f5e0..959447d5 100644 --- a/tests/models/independent/test_AttributeDict.py +++ b/tests/models/independent/test_AttributeDict.py @@ -162,8 +162,6 @@ class AttributeDictUpdateTests: """ - # TODO: ensure that we cover tests cases for both merged and non-merged. - _TOTALLY_ARBITRARY_VALUES = [ "abc", 123, (4, "text", ("nes", "ted")), list("-101") diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index e54a2739..d9596ee1 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -355,8 +355,12 @@ def test_no_subtypes_section( self, tmpdir, path_config_file, atac_pipe_name, num_sample_subclasses, decoy_class): """ DEPENDS ON PIPELINE MODULE CONTENT """ + + # Basic values to invoke the function under test pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) piface = ProtocolInterface(path_config_file) + + # How to define the Sample subtypes (and non-subtype) sample_subclass_basename = "SampleSubclass" sample_lines = [ "class {basename}{index}(Sample):", @@ -365,6 +369,14 @@ def test_no_subtypes_section( non_sample_class_lines = [ "class NonSample(object):", "\tdef __init__(self):", "\t\tsuper(NonSample, self).__init__()"] + + # We expect the subtype iff there's just one Sample subtype. + if num_sample_subclasses == 1: + exp_subtype_name = "{}0".format(sample_subclass_basename) + else: + exp_subtype_name = Sample.__name__ + + # Fill in the class definition template lines. def populate_sample_lines(n_classes): return [[sample_lines[0].format(basename=sample_subclass_basename, index=class_index), @@ -372,18 +384,21 @@ def populate_sample_lines(n_classes): sample_lines[2].format(basename=sample_subclass_basename, index=class_index)] for class_index in range(n_classes)] + + # Determine the groups of lines to permute. class_lines_pool = populate_sample_lines(num_sample_subclasses) if decoy_class: class_lines_pool.append(non_sample_class_lines) + + # Subtype fetch is independent of class declaration order, + # so validate each permutation. for lines_order in itertools.permutations(class_lines_pool): + # Write out class declarations and invoke the function under test. _create_module(lines_by_class=lines_order, filepath=pipe_path) subtype = piface.fetch_sample_subtype( protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) - if num_sample_subclasses == 1: - exp_subtype_name = "{}0".format(sample_subclass_basename) - else: - exp_subtype_name = Sample.__name__ + try: assert exp_subtype_name == subtype.__name__ except AssertionError: @@ -395,7 +410,8 @@ def populate_sample_lines(n_classes): @pytest.mark.parametrize( argnames="spec_type", argvalues=["singleton", "mapping"]) - def test_Sample_as_name(self, tmpdir, spec_type): + def test_Sample_as_name( + self, tmpdir, spec_type, atacseq_piface_data, atac_pipe_name): """ A pipeline may redeclare Sample as a subtype name. 
""" pass @@ -439,16 +455,6 @@ def subtypes_section_multiple(self, atac_pipe_name): pass - @pytest.fixture(scope="function") - def create_module(self, request, tmpdir): - num_sample_subclasses = \ - request.getfixturevalue("num_sample_subclasses") - num_non_sample_subclasses = \ - request.getfixturevalue("num_non_sample_subclasses") - path_module_file = os.path.join(tmpdir.strpath, "_dummy_classes.py") - with open(path_module_file, 'w') as modfile: - pass - def _create_module(lines_by_class, filepath): """ @@ -465,17 +471,3 @@ def _create_module(lines_by_class, filepath): with open(filepath, 'w') as modfile: modfile.write("{}\n".format(lines)) return filepath - - - -def _generate_lines_by_class(sample_subclass_vector): - """ - Generate lines of text that define dummy classes. - - The input is a vector of flags indicating the order in which the classes - should be defined, and whether each should derive from Sample. - - :param sample_subclass_vector: - :return: - """ - pass diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py index 301f2c14..8c5772b4 100644 --- a/tests/models/independent/test_Sample.py +++ b/tests/models/independent/test_Sample.py @@ -15,19 +15,6 @@ -def pytest_generate_tests(metafunc): - """ Customization of this module's test cases. """ - if metafunc.cls == CustomSampleTests: - if "subclass_attrname" in metafunc.fixturenames: - metafunc.parametrize(argnames="subclass_attrname", - argvalues=["library", "protocol"]) - if "pipelines_type" in metafunc.fixturenames: - metafunc.parametrize(argnames="pipelines_type", - argvalues=["module", "package"]) - - - - class ParseSampleImplicationsTests: """ Tests for appending columns/fields to a Sample based on a mapping. """ @@ -147,87 +134,6 @@ def test_requires_sample_name(self, has_name, data_type): -@pytest.mark.skip("Not implemented") -class CustomSampleTests: - """ Bespoke Sample creation tests. """ - - - PROTOCOLS = ["WGBS", "RRBS", "ATAC-Seq", "RNA-seq"] - - - @pytest.mark.fixture(scope="function") - def sample_subclass_definition(self, tmpdir, request): - subclass_attrname = request.getfixturevalue("subclass_attrname") - pipelines_type = request.getfixturevalue("pipelines_type") - if "pipe_path" in request.fixturenames: - pipe_path = tmpdir.strpath - else: - pipe_path = request.getfixturevalue("pipe_path") - if pipelines_type == "module": - pipe_path = os.path.join(pipe_path, "pipelines.py") - elif pipelines_type == "package": - init_file = os.path.join(pipe_path, "__init__.py") - with open(init_file, 'w') as f: - pass - module_file = tempfile.NamedTemporaryFile(dir=pipe_path, suffix=".py", delete=False) - module_file.close() - with open(module_file.name, 'w') as modfile: - # TODO: write out definition. - pass - else: - raise ValueError( - "Unknown pipelines type: {}; module and package " - "are supported".format(pipelines_type)) - - # TODO: ensure cleanup. - request.addfinalizer() - - - DATA_FOR_SAMPLES = { - SAMPLE_NAME_COLNAME: ["sample{}".format(i) for i in range(3)], - "arbitrary-value": list(np.random.randint(-1000, 1000, size=3))} - - - CLASS_DEFINITION_LINES = """\"\"\" Sample subclass test file. \"\"\" - - from looper.models import Sample - - class DummySampleSubclass(Sample): - \"\"\" Subclass shell to test Project's Sample subclass seek sensitivity. \"\"\" - __{attribute_name}__ = {attribute_value} - pass - - class NotSampleSubclass(Sample): - \"\"\" Subclass shell to test Project's Sample subclass seek specificity. 
\"\"\" - __unrecognized__ = irrelevant - - """ - - - def test_generic_sample_for_unfindable_subclass(self): - """ If no Sample subclass is found, a generic Sample is created. """ - pass - - - def test_raw_pipelines_import_has_sample_subclass( - self, pipelines_type, subclass_attrname): - """ Project finds Sample subclass in pipelines package. """ - pass - - - def test_project_pipelines_dir_has_sample_subclass( - self, pipelines_type, subclass_attrname): - """ Project finds Sample subclass in optional pipelines_dir. """ - pass - - - def test_sample_subclass_messaging( - self, pipelines_type, subclass_attrname): - """ Sample subclass seek process provides info about procedure. """ - pass - - - @pytest.mark.parametrize( argnames="accessor", argvalues=["attr", "item"], ids=lambda access_mode: "accessor={}".format(access_mode)) diff --git a/tests/models/test_models_smoke.py b/tests/models/test_models_smoke.py index 2f55b1e1..ec06fd6d 100644 --- a/tests/models/test_models_smoke.py +++ b/tests/models/test_models_smoke.py @@ -57,7 +57,7 @@ def __init__(self): def test_repr_smoke(self, class_name, method_name): """ Object representation method successfully returns string. """ - # TODO: with pytest.raises(None) in 3.1+ + # TODO: "with pytest.raises(None)..." in 3.1+ assert str is type(getattr(class_name, method_name).__call__()) diff --git a/tests/test_looper.py b/tests/test_looper.py index e3e55ccb..ff3e24c2 100644 --- a/tests/test_looper.py +++ b/tests/test_looper.py @@ -36,7 +36,6 @@ class ProjectConstructorTest: argvalues=["required_inputs", "all_input_attr"]) def test_sample_required_inputs_not_set(self, proj, attr_name): """ Samples' inputs are not set in `Project` ctor. """ - # TODO: update this to check for null if design is changed as may be. with pytest.raises(AttributeError): getattr(proj.samples[nprand.randint(len(proj.samples))], attr_name) From 09c6b6365d8b503d8926c084aac35eb27f8524dc Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 16:04:58 -0400 Subject: [PATCH 86/94] add the sample name collision tests --- .../independent/test_ProtocolInterface.py | 51 +++++++++++++++++-- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index d9596ee1..ea1f3176 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -22,7 +22,7 @@ __email__ = "vreuter@virginia.edu" -ATAC_PROTOCOL_NAME = "atac" +ATAC_PROTOCOL_NAME = "ATAC" class CustomExceptionA(Exception): @@ -129,7 +129,7 @@ def test_relpath_with_dot_becomes_absolute( exp_path = os.path.join(tmpdir.strpath, sans_dot_path) path_config_file = _write_config_data( - protomap={"ATAC": atac_pipe_name}, + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) piface = ProtocolInterface(path_config_file) _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) @@ -319,7 +319,7 @@ def test_problematic_import_builtin_exception( """ Base Sample is used if builtin exception on pipeline import. 
""" # Values needed for object creation and function invocation - protocol = "ATAC" + protocol = ATAC_PROTOCOL_NAME protocol_mapping = {protocol: atac_pipe_name} full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) @@ -409,11 +409,52 @@ def populate_sample_lines(n_classes): @pytest.mark.parametrize( - argnames="spec_type", argvalues=["singleton", "mapping"]) + argnames="spec_type", argvalues=[str, dict]) def test_Sample_as_name( self, tmpdir, spec_type, atacseq_piface_data, atac_pipe_name): """ A pipeline may redeclare Sample as a subtype name. """ - pass + + # General values for the test + subtype_name = Sample.__name__ + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # Define the subtype in the pipeline module. + lines = ["from looper.models import Sample\n", + "class {}({}):\n".format(subtype_name, subtype_name), + "\tdef __init__(self, *args, **kwargs):\n", + "\t\tsuper({}, self).__init__(*args, **kwargs)\n". + format(subtype_name)] + with open(pipe_path, 'w') as pipe_module_file: + for l in lines: + pipe_module_file.write(l) + + # Determine how to specify the subtype. + if spec_type is str: + section_value = subtype_name + elif spec_type is dict: + section_value = {ATAC_PROTOCOL_NAME: subtype_name} + else: + raise ValueError("Unexpected subtype specification: {}". + format(spec_type)) + + # Add the subtype specification, create the interface, and get subtype. + atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = section_value + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + # Establish that subclass relationship is improper. + assert issubclass(Sample, Sample) + # Our subtype derives from base Sample... + assert issubclass(subtype, Sample) + # ...but not vice-versa. + assert not issubclass(Sample, subtype) + # And we retained the name. + assert subtype.__name__ == Sample.__name__ def test_subtypes_single_name_non_implemented(self): From 36b46b81b8d44f0ad171b22c4e41e2f553d481c6 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 17:47:44 -0400 Subject: [PATCH 87/94] separate concerns; add unimplemented subtype class tests --- looper/models.py | 31 +++- tests/models/independent/test_Project.py | 3 +- .../independent/test_ProtocolInterface.py | 162 +++++++++++++----- 3 files changed, 143 insertions(+), 53 deletions(-) diff --git a/looper/models.py b/looper/models.py index 2f5b1b2b..560894ab 100644 --- a/looper/models.py +++ b/looper/models.py @@ -53,6 +53,7 @@ OrderedDict as _OrderedDict from functools import partial import glob +import inspect import itertools import logging import os as _os @@ -2826,20 +2827,15 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): "naming it '%s'", pipeline_filepath, pipeline_module.__name__) - import inspect def class_names(cs): return ", ".join([c.__name__ for c in cs]) # Find classes from pipeline module and determine which derive from Sample. - classes = inspect.getmembers( - pipeline_module, lambda obj: inspect.isclass(obj)) - classes = [klazz for _, klazz in classes] + classes = _fetch_classes(pipeline_module) _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) # Base Sample could be imported; we want the true subtypes. 
- proper_subtypes = list(filter( - lambda c: issubclass(c, base_type) and c != base_type, - classes)) + proper_subtypes = _proper_subtypes(classes, base_type) _LOGGER.debug("%d %s subtype(s): %s", len(proper_subtypes), base_type.__name__, class_names(proper_subtypes)) @@ -2867,11 +2863,28 @@ def class_names(cs): _LOGGER.debug("Successfully imported %s from '%s'", subtype_name, pipeline_filepath) return st - _LOGGER.warn("No subtype from '%s' matches '%s'; using base: %s", - pipeline_filepath, subtype_name, base_type.__name__) + _LOGGER.warn("No %s subtype from '%s' matches '%s'; using base: %s", + base_type.__name__, pipeline_filepath, + subtype_name, base_type.__name__) return base_type +def _fetch_classes(mod): + try: + _, classes = zip(*inspect.getmembers( + mod, lambda o: inspect.isclass(o))) + except ValueError: + return [] + return list(classes) + + + +def _proper_subtypes(types, supertype): + return list(filter( + lambda t: issubclass(t, supertype) and t != supertype, types)) + + + def _is_member(item, items): return item in items diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index d079786e..4d3fe5f8 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -53,8 +53,7 @@ class ProjectConstructorTests: argnames="lazy", argvalues=[False, True], ids=lambda lazy: "lazy={}".format(lazy)) def test_no_merge_table_in_config( - self, tmpdir, spec_type, lazy, - proj_conf_data, path_sample_anns): + self, tmpdir, spec_type, lazy, proj_conf_data, path_sample_anns): """ Merge table attribute remains null if config lacks merge_table. """ metadata = proj_conf_data["metadata"] try: diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index ea1f3176..79079ede 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -16,6 +16,7 @@ from looper import models, DEV_LOGGING_FMT from looper.models import ProtocolInterface, Sample +from looper.utils import import_from_source __author__ = "Vince Reuter" @@ -45,23 +46,12 @@ def __init__(self, *args): not issubclass(o, UnicodeError))))) - -def _write_config_data(protomap, conf_data, dirpath): - """ - Write ProtocolInterface data to (temp)file. - - :param Mapping protomap: mapping from protocol name to pipeline key/name - :param Mapping conf_data: mapping from pipeline key/name to configuration - data for a PipelineInterface - :param str dirpath: path to filesystem location in which to place the - file to write - :return str: path to the (temp)file written - """ - full_conf_data = {"protocol_mapping": protomap, "pipelines": conf_data} - filepath = os.path.join(dirpath, "pipeline_interface.yaml") - with open(filepath, 'w') as conf_file: - yaml.safe_dump(full_conf_data, conf_file) - return filepath +def pytest_generate_tests(metafunc): + """ Customization of this module's test cases. """ + if "subtypes_section_spec_type" in metafunc.fixturenames: + # Subtypes section can be raw string or mapping. + metafunc.parametrize(argnames="subtypes_section_spec_type", + argvalues=[str, dict]) @@ -399,6 +389,8 @@ def populate_sample_lines(n_classes): protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + # Make the assertion on subtype name, getting additional + # information about the module that we defined if there's failure. 
try: assert exp_subtype_name == subtype.__name__ except AssertionError: @@ -409,9 +401,10 @@ def populate_sample_lines(n_classes): @pytest.mark.parametrize( - argnames="spec_type", argvalues=[str, dict]) + argnames="subtype_name", argvalues=[Sample.__name__]) def test_Sample_as_name( - self, tmpdir, spec_type, atacseq_piface_data, atac_pipe_name): + self, tmpdir, subtype_name, atac_pipe_name, + atacseq_piface_data_with_subtypes): """ A pipeline may redeclare Sample as a subtype name. """ # General values for the test @@ -428,20 +421,10 @@ def test_Sample_as_name( for l in lines: pipe_module_file.write(l) - # Determine how to specify the subtype. - if spec_type is str: - section_value = subtype_name - elif spec_type is dict: - section_value = {ATAC_PROTOCOL_NAME: subtype_name} - else: - raise ValueError("Unexpected subtype specification: {}". - format(spec_type)) - - # Add the subtype specification, create the interface, and get subtype. - atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = section_value conf_path = _write_config_data( protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, - conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + conf_data=atacseq_piface_data_with_subtypes, + dirpath=tmpdir.strpath) piface = ProtocolInterface(conf_path) subtype = piface.fetch_sample_subtype( protocol=ATAC_PROTOCOL_NAME, @@ -457,15 +440,62 @@ def test_Sample_as_name( assert subtype.__name__ == Sample.__name__ - def test_subtypes_single_name_non_implemented(self): - pass + @pytest.mark.parametrize( + argnames="include_decoy", argvalues=[False, True]) + @pytest.mark.parametrize(argnames="subtype_name", argvalues=["NonSample"]) + @pytest.mark.parametrize( + argnames="test_type", + argvalues=["return_sample", "class_found"]) + def test_subtypes_non_implemented( + self, tmpdir, atac_pipe_name, + subtype_name, test_type, include_decoy, + atacseq_piface_data_with_subtypes, subtypes_section_spec_type): + """ Subtype in interface but not in pipeline is exceptional. """ + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) - def test_subtypes_section_single_subtype_name_is_not_sample_subtype(self): - pass + # Write out the pipeline module file. + if include_decoy: + lines = ["class {}(object):\n".format(subtype_name), + "\tdef __init__(self, *args, **kwarggs):\n", + "\t\tsuper({}, self).__init__(*args, **kwargs)". + format(subtype_name)] + else: + lines = [] + with open(pipe_path, 'w') as pipe_module_file: + for l in lines: + pipe_module_file.write(l) + # Create the ProtocolInterface. + path_config_file = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data_with_subtypes, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(path_config_file) - def test_subtypes_section_single_subtype_name_is_sample_subtype(self): + # Perform the call under test. + kwargs = {"protocol": ATAC_PROTOCOL_NAME, + "strict_pipe_key": atac_pipe_name, + "full_pipe_path": pipe_path} + if test_type not in ["return_sample", "class_found"]: + raise ValueError("Unexpected test type: {}".format(test_type)) + if test_type == "return_sample": + # We should always get back the base Sample... + subtype = piface.fetch_sample_subtype(**kwargs) + assert subtype is Sample + else: + with mock.patch("looper.models._proper_subtypes") as mocked_filter: + piface.fetch_sample_subtype(**kwargs) + # but have found the decoy class only if present. 
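                # (mock's call_args[0] is the tuple of positional arguments
                # from the most recent call; its first element here is the
                # collection of classes that the subtype search pulled from
                # the pipeline module.)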
+ exp_cls_names = [subtype_name] if include_decoy else [] + positional_arguments = mocked_filter.call_args[0] + types_found = positional_arguments[0] + obs_cls_names = [t.__name__ for t in types_found] + assert exp_cls_names == obs_cls_names + + + def test_subtype_is_not_Sample(self): + """ Subtype that doesn't extend Sample isn't used. """ pass @@ -473,27 +503,56 @@ def test_subtypes_mapping_to_non_implemented_class(self): pass - def test_has_subtypes_mapping_but_protocol_doesnt_match(self): + def test_subtypes_mapping_to_non_sample_subtype(self): pass - def test_subtypes_mapping_to_non_sample_subtype(self): + def test_subtypes_section_is_sample_subtype(self): + # Parameterize over mapping and singleton pass def test_sample_grandchild(self): """ The subtype to be used can be a grandchild of Sample. """ + # Can parameterize but don't need to pass - @pytest.fixture(scope="function") - def subtypes_section_single(self, atac_pipe_name): + def test_has_subtypes_mapping_but_protocol_doesnt_match(self): + # Intrinsic to mapping; this is a failure case. pass @pytest.fixture(scope="function") - def subtypes_section_multiple(self, atac_pipe_name): - pass + def atacseq_piface_data_with_subtypes( + self, request, atacseq_piface_data, atac_pipe_name): + """ + Provide test case with ProtocolInterface data. + + :param pytest._pytest.fixtures.SubRequest request: test case + requesting the parameterization + :param Mapping atacseq_piface_data: the ProtocolInterface data + :param str atac_pipe_name: name for the pipeline + :return Mapping: same as input, but with Sample subtype specification + section mixed in + """ + + # Get the test case's parameterized values. + spec_type = request.getfixturevalue("subtypes_section_spec_type") + subtype_name = request.getfixturevalue("subtype_name") + + # Determine how to specify the subtype(s). + if spec_type is str: + section_value = subtype_name + elif spec_type is dict: + section_value = {ATAC_PROTOCOL_NAME: subtype_name} + else: + raise ValueError("Unexpected subtype section specification type: " + "{}".format(spec_type)) + + # Update and return the interface data. + atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = section_value + return atacseq_piface_data @@ -512,3 +571,22 @@ def _create_module(lines_by_class, filepath): with open(filepath, 'w') as modfile: modfile.write("{}\n".format(lines)) return filepath + + + +def _write_config_data(protomap, conf_data, dirpath): + """ + Write ProtocolInterface data to (temp)file. 
+ + :param Mapping protomap: mapping from protocol name to pipeline key/name + :param Mapping conf_data: mapping from pipeline key/name to configuration + data for a PipelineInterface + :param str dirpath: path to filesystem location in which to place the + file to write + :return str: path to the (temp)file written + """ + full_conf_data = {"protocol_mapping": protomap, "pipelines": conf_data} + filepath = os.path.join(dirpath, "pipeline_interface.yaml") + with open(filepath, 'w') as conf_file: + yaml.safe_dump(full_conf_data, conf_file) + return filepath From 838669b7f2f2cd4f664f63729c8b86cbfe581e48 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 17:50:18 -0400 Subject: [PATCH 88/94] add the proper fixture hook for parameterization --- tests/models/independent/test_ProtocolInterface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 79079ede..b9e88c7b 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -404,7 +404,7 @@ def populate_sample_lines(n_classes): argnames="subtype_name", argvalues=[Sample.__name__]) def test_Sample_as_name( self, tmpdir, subtype_name, atac_pipe_name, - atacseq_piface_data_with_subtypes): + subtypes_section_spec_type, atacseq_piface_data_with_subtypes): """ A pipeline may redeclare Sample as a subtype name. """ # General values for the test From e65254de17817194ae95fc7dd350479c9b13f9c8 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 18:57:28 -0400 Subject: [PATCH 89/94] stricter subtype enforcement, cleanup, add tests --- looper/models.py | 23 ++--- .../independent/test_ProtocolInterface.py | 85 +++++++++---------- 2 files changed, 47 insertions(+), 61 deletions(-) diff --git a/looper/models.py b/looper/models.py index 560894ab..e9d34afd 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2765,15 +2765,6 @@ def __init__(self, pipeline): -class _UndefinedSampleSubtypeException(Exception): - """ Sample subtype--if declared in PipelineInterface--must be found. """ - def __init__(self, subtype_name, pipeline_filepath): - reason = "Sample subtype {} cannot be imported from '{}'".\ - format(subtype_name, pipeline_filepath) - super(_UndefinedSampleSubtypeException, self).__init__(reason) - - - def _import_sample_subtype(pipeline_filepath, subtype_name=None): """ Import a particular Sample subclass from a Python module. @@ -2785,8 +2776,6 @@ def _import_sample_subtype(pipeline_filepath, subtype_name=None): be used; otherwise, the base Sample type will be used. :return type: the imported class, defaulting to base Sample in case of failure with the import or other logic - :raises _UndefinedSampleSubtypeException: if the module is imported but - type indicated by subtype_name is not found as a class """ base_type = Sample @@ -2863,14 +2852,16 @@ def class_names(cs): _LOGGER.debug("Successfully imported %s from '%s'", subtype_name, pipeline_filepath) return st - _LOGGER.warn("No %s subtype from '%s' matches '%s'; using base: %s", - base_type.__name__, pipeline_filepath, - subtype_name, base_type.__name__) - return base_type + raise ValueError( + "'{}' matches none of the {} {} subtype(s) defined " + "in '{}': {}".format(subtype_name, len(proper_subtypes), + base_type.__name__, pipeline_filepath, + class_names(proper_subtypes))) def _fetch_classes(mod): + """ Return the classes defined in a module. 
""" try: _, classes = zip(*inspect.getmembers( mod, lambda o: inspect.isclass(o))) @@ -2881,10 +2872,12 @@ def _fetch_classes(mod): def _proper_subtypes(types, supertype): + """ Determine the proper subtypes of a supertype. """ return list(filter( lambda t: issubclass(t, supertype) and t != supertype, types)) def _is_member(item, items): + """ Determine whether an iterm is a member of a collection. """ return item in items diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index b9e88c7b..42ba887f 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -440,71 +440,56 @@ def test_Sample_as_name( assert subtype.__name__ == Sample.__name__ - @pytest.mark.parametrize( - argnames="include_decoy", argvalues=[False, True]) @pytest.mark.parametrize(argnames="subtype_name", argvalues=["NonSample"]) @pytest.mark.parametrize( - argnames="test_type", - argvalues=["return_sample", "class_found"]) - def test_subtypes_non_implemented( - self, tmpdir, atac_pipe_name, - subtype_name, test_type, include_decoy, + argnames="test_type", argvalues=["return_sample", "class_found"]) + def test_subtype_is_not_Sample( + self, tmpdir, atac_pipe_name, subtype_name, test_type, atacseq_piface_data_with_subtypes, subtypes_section_spec_type): """ Subtype in interface but not in pipeline is exceptional. """ pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) - # Write out the pipeline module file. - if include_decoy: - lines = ["class {}(object):\n".format(subtype_name), - "\tdef __init__(self, *args, **kwarggs):\n", - "\t\tsuper({}, self).__init__(*args, **kwargs)". - format(subtype_name)] - else: - lines = [] + # Write out pipeline module file with non-Sample class definition. + lines = _class_definition_lines(subtype_name) with open(pipe_path, 'w') as pipe_module_file: for l in lines: pipe_module_file.write(l) - # Create the ProtocolInterface. + # Create the ProtocolInterface and do the test call. path_config_file = _write_config_data( protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, conf_data=atacseq_piface_data_with_subtypes, dirpath=tmpdir.strpath) piface = ProtocolInterface(path_config_file) - - # Perform the call under test. - kwargs = {"protocol": ATAC_PROTOCOL_NAME, - "strict_pipe_key": atac_pipe_name, - "full_pipe_path": pipe_path} - if test_type not in ["return_sample", "class_found"]: - raise ValueError("Unexpected test type: {}".format(test_type)) - if test_type == "return_sample": - # We should always get back the base Sample... - subtype = piface.fetch_sample_subtype(**kwargs) - assert subtype is Sample - else: - with mock.patch("looper.models._proper_subtypes") as mocked_filter: - piface.fetch_sample_subtype(**kwargs) - # but have found the decoy class only if present. 
- exp_cls_names = [subtype_name] if include_decoy else [] - positional_arguments = mocked_filter.call_args[0] - types_found = positional_arguments[0] - obs_cls_names = [t.__name__ for t in types_found] - assert exp_cls_names == obs_cls_names + with pytest.raises(ValueError): + piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) - def test_subtype_is_not_Sample(self): + @pytest.mark.parametrize(argnames="subtype_name", argvalues=["irrelevant"]) + @pytest.mark.parametrize(argnames="decoy_class", argvalues=[False, True], + ids=lambda decoy: " decoy = {} ".format(decoy)) + def test_subtype_not_implemented( + self, tmpdir, atac_pipe_name, subtype_name, decoy_class, + atacseq_piface_data_with_subtypes, subtypes_section_spec_type): """ Subtype that doesn't extend Sample isn't used. """ - pass - - - def test_subtypes_mapping_to_non_implemented_class(self): - pass - - - def test_subtypes_mapping_to_non_sample_subtype(self): - pass + # Create the pipeline module. + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + lines = _class_definition_lines(name="Decoy") if decoy_class else [] + with open(pipe_path, 'w') as modfile: + for l in lines: + modfile.write(l) + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data_with_subtypes, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + with pytest.raises(ValueError): + piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) def test_subtypes_section_is_sample_subtype(self): @@ -556,6 +541,14 @@ def atacseq_piface_data_with_subtypes( +def _class_definition_lines(name): + """ Create lines that define a class. """ + return ["class {}(object):\n".format(name), + "\tdef __init__(self, *args, **kwarggs):\n", + "\t\tsuper({}, self).__init__(*args, **kwargs)".format(name)] + + + def _create_module(lines_by_class, filepath): """ Write out lines that will defined a module. From e310833cc450004edc087fa0a6860bdcc3491f0d Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 21:59:44 -0400 Subject: [PATCH 90/94] first pass at finishing off tests --- .../independent/test_ProtocolInterface.py | 131 +++++++++++++++--- 1 file changed, 112 insertions(+), 19 deletions(-) diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index 42ba887f..bd0d7dcb 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -16,13 +16,13 @@ from looper import models, DEV_LOGGING_FMT from looper.models import ProtocolInterface, Sample -from looper.utils import import_from_source __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" +SUBTYPES_KEY = ProtocolInterface.SUBTYPE_MAPPING_SECTION ATAC_PROTOCOL_NAME = "ATAC" @@ -273,7 +273,7 @@ def test_protocol_match_is_fuzzy( # need only an arbitrary class name about which we can make the # relevant assertion(s). test_class_name = "TotallyArbitrary" - atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = \ + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = \ test_class_name # Write out configuration data and create the ProtocolInterface. @@ -314,7 +314,7 @@ def test_problematic_import_builtin_exception( full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) # Modify the data for the ProtocolInterface and create it. 
- atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = \ + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = \ {protocol: "IrrelevantClassname"} conf_path = _write_config_data( protomap=protocol_mapping, @@ -451,7 +451,7 @@ def test_subtype_is_not_Sample( pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) # Write out pipeline module file with non-Sample class definition. - lines = _class_definition_lines(subtype_name) + lines = _class_definition_lines(subtype_name, name_super_type="object") with open(pipe_path, 'w') as pipe_module_file: for l in lines: pipe_module_file.write(l) @@ -477,7 +477,8 @@ def test_subtype_not_implemented( """ Subtype that doesn't extend Sample isn't used. """ # Create the pipeline module. pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) - lines = _class_definition_lines(name="Decoy") if decoy_class else [] + lines = _class_definition_lines("Decoy", "object") \ + if decoy_class else [] with open(pipe_path, 'w') as modfile: for l in lines: modfile.write(l) @@ -491,21 +492,112 @@ def test_subtype_not_implemented( protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + @pytest.mark.parametrize( + argnames="subtype_name", argvalues=["SubsampleA", "SubsampleB"]) + def test_matches_sample_subtype( + self, tmpdir, atac_pipe_name, subtype_name, atacseq_piface_data): + """ Fetch of subtype is specific even from among multiple subtypes. """ - def test_subtypes_section_is_sample_subtype(self): - # Parameterize over mapping and singleton - pass + # Basic values + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + decoy_class = "Decoy" + decoy_proto = "DECOY" + # Update the ProtocolInterface data and write it out. + atacseq_piface_data[SUBTYPES_KEY] = { + ATAC_PROTOCOL_NAME: subtype_name, decoy_proto: decoy_class} + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name, + decoy_proto: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + + # Create the collection of definition lines for each class. + legit_lines = _class_definition_lines(subtype_name, Sample.__name__) + decoy_lines = _class_definition_lines(decoy_class, Sample.__name__) + + for lines_order in itertools.permutations([legit_lines, decoy_lines]): + with open(pipe_path, 'w') as pipe_mod_file: + for class_lines in lines_order: + for line in class_lines: + pipe_mod_file.write(line) + pipe_mod_file.write("\n\n") + + # We need the new pipeline module file in place before the + # ProtocolInterface is created. + piface = ProtocolInterface(conf_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + assert subtype_name == subtype.__name__ - def test_sample_grandchild(self): - """ The subtype to be used can be a grandchild of Sample. """ - # Can parameterize but don't need to - pass + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["single", "nested"]) + def test_subtypes_list( + self, tmpdir, atac_pipe_name, atacseq_piface_data, spec_type): + """ As singleton or within mapping, only 1 subtype allowed. """ + + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) - def test_has_subtypes_mapping_but_protocol_doesnt_match(self): - # Intrinsic to mapping; this is a failure case. - pass + # Define the classes, writing them in the pipeline module file. 
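
        # (For context on the shapes exercised here: elsewhere in these tests
        # the subtypes section, the "sample_subtypes" key now referenced as
        # SUBTYPES_KEY, is given either as a single class name or as a mapping
        # from protocol to class name, e.g. with a hypothetical subtype:
        #
        #     sample_subtypes: ATACseqSample
        #     sample_subtypes: {ATAC: ATACseqSample}
        #
        # A list of names, whether bare or nested under a protocol as below,
        # is expected to trigger an error.)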
+ subtype_names = ["ArbitraryA", "PlaceholderB"] + with open(pipe_path, 'w') as pipe_module_file: + for subtype_name in subtype_names: + # Have the classes be Sample subtypes. + for line in _class_definition_lines( + subtype_name, name_super_type=Sample.__name__): + pipe_module_file.write(line) + + # Update the ProtocolInterface data. + subtype_section = subtype_names if spec_type == "single" \ + else {ATAC_PROTOCOL_NAME: subtype_names} + atacseq_piface_data[SUBTYPES_KEY] = subtype_section + + # Create the ProtocolInterface. + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + + # We don't really care about exception type, just that one arises. + with pytest.raises(Exception): + piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + + @pytest.mark.parametrize( + argnames="target", argvalues=["middle", "bottom"]) + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["single", "mapping"]) + def test_sample_grandchild( + self, tmpdir, spec_type, target, + atacseq_piface_data, name_atac_pipe): + """ The subtype to be used can be a grandchild of Sample. """ + pipe_path = os.path.join(tmpdir.strpath, name_atac_pipe) + intermediate_sample_subtype = "Middle" + leaf_sample_subtype = "Leaf" + intermediate_subtype_lines = _class_definition_lines( + intermediate_sample_subtype, Sample.__name__) + leaf_subtype_lines = _class_definition_lines( + leaf_sample_subtype, intermediate_sample_subtype) + with open(pipe_path, 'w') as pipe_mod_file: + for l in intermediate_subtype_lines: + pipe_mod_file.write(l) + pipe_mod_file.write("\n\n") + for l in leaf_subtype_lines: + pipe_mod_file.write(l) + atacseq_piface_data[SUBTYPES_KEY] = target if spec_type == "single" \ + else {ATAC_PROTOCOL_NAME: target} + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: name_atac_pipe}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=name_atac_pipe, + full_pipe_path=pipe_path) + assert target == subtype.__name__ @pytest.fixture(scope="function") @@ -536,16 +628,17 @@ def atacseq_piface_data_with_subtypes( "{}".format(spec_type)) # Update and return the interface data. - atacseq_piface_data[atac_pipe_name]["sample_subtypes"] = section_value + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = section_value return atacseq_piface_data -def _class_definition_lines(name): +def _class_definition_lines(name, name_super_type): """ Create lines that define a class. 
""" - return ["class {}(object):\n".format(name), + return ["class {t}({st}):\n".format(name), "\tdef __init__(self, *args, **kwarggs):\n", - "\t\tsuper({}, self).__init__(*args, **kwargs)".format(name)] + "\t\tsuper({t}, self).__init__(*args, **kwargs)".format( + t=name, st=name_super_type)] From 33ea9c1d7e75c108b1cc367267e32d222f9f47a8 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Mon, 26 Jun 2017 22:22:55 -0400 Subject: [PATCH 91/94] fix the few test mistakes --- looper/models.py | 2 +- .../independent/test_ProtocolInterface.py | 36 ++++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/looper/models.py b/looper/models.py index e9d34afd..644eb673 100644 --- a/looper/models.py +++ b/looper/models.py @@ -2825,7 +2825,7 @@ def class_names(cs): # Base Sample could be imported; we want the true subtypes. proper_subtypes = _proper_subtypes(classes, base_type) - _LOGGER.debug("%d %s subtype(s): %s", len(proper_subtypes), + _LOGGER.debug("%d proper %s subtype(s): %s", len(proper_subtypes), base_type.__name__, class_names(proper_subtypes)) # Determine course of action based on subtype request and number found. diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py index bd0d7dcb..4c673dbb 100644 --- a/tests/models/independent/test_ProtocolInterface.py +++ b/tests/models/independent/test_ProtocolInterface.py @@ -24,6 +24,7 @@ SUBTYPES_KEY = ProtocolInterface.SUBTYPE_MAPPING_SECTION ATAC_PROTOCOL_NAME = "ATAC" +SAMPLE_IMPORT = "from looper.models import Sample" class CustomExceptionA(Exception): @@ -453,6 +454,7 @@ def test_subtype_is_not_Sample( # Write out pipeline module file with non-Sample class definition. lines = _class_definition_lines(subtype_name, name_super_type="object") with open(pipe_path, 'w') as pipe_module_file: + pipe_module_file.write("{}\n\n".format(SAMPLE_IMPORT)) for l in lines: pipe_module_file.write(l) @@ -480,6 +482,7 @@ def test_subtype_not_implemented( lines = _class_definition_lines("Decoy", "object") \ if decoy_class else [] with open(pipe_path, 'w') as modfile: + modfile.write("{}\n\n".format(SAMPLE_IMPORT)) for l in lines: modfile.write(l) conf_path = _write_config_data( @@ -505,7 +508,7 @@ def test_matches_sample_subtype( decoy_proto = "DECOY" # Update the ProtocolInterface data and write it out. - atacseq_piface_data[SUBTYPES_KEY] = { + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = { ATAC_PROTOCOL_NAME: subtype_name, decoy_proto: decoy_class} conf_path = _write_config_data( protomap={ATAC_PROTOCOL_NAME: atac_pipe_name, @@ -518,6 +521,7 @@ def test_matches_sample_subtype( for lines_order in itertools.permutations([legit_lines, decoy_lines]): with open(pipe_path, 'w') as pipe_mod_file: + pipe_mod_file.write("{}\n\n".format(SAMPLE_IMPORT)) for class_lines in lines_order: for line in class_lines: pipe_mod_file.write(line) @@ -543,16 +547,18 @@ def test_subtypes_list( # Define the classes, writing them in the pipeline module file. subtype_names = ["ArbitraryA", "PlaceholderB"] with open(pipe_path, 'w') as pipe_module_file: + pipe_module_file.write("{}\n\n".format(SAMPLE_IMPORT)) for subtype_name in subtype_names: # Have the classes be Sample subtypes. for line in _class_definition_lines( subtype_name, name_super_type=Sample.__name__): pipe_module_file.write(line) + pipe_module_file.write("\n\n") # Update the ProtocolInterface data. 
subtype_section = subtype_names if spec_type == "single" \ else {ATAC_PROTOCOL_NAME: subtype_names} - atacseq_piface_data[SUBTYPES_KEY] = subtype_section + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = subtype_section # Create the ProtocolInterface. conf_path = _write_config_data( @@ -568,35 +574,42 @@ def test_subtypes_list( @pytest.mark.parametrize( - argnames="target", argvalues=["middle", "bottom"]) + argnames="target", argvalues=["Leaf", "Middle"]) @pytest.mark.parametrize( argnames="spec_type", argvalues=["single", "mapping"]) def test_sample_grandchild( self, tmpdir, spec_type, target, - atacseq_piface_data, name_atac_pipe): + atacseq_piface_data, atac_pipe_name): """ The subtype to be used can be a grandchild of Sample. """ - pipe_path = os.path.join(tmpdir.strpath, name_atac_pipe) + + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) intermediate_sample_subtype = "Middle" leaf_sample_subtype = "Leaf" + intermediate_subtype_lines = _class_definition_lines( intermediate_sample_subtype, Sample.__name__) leaf_subtype_lines = _class_definition_lines( leaf_sample_subtype, intermediate_sample_subtype) with open(pipe_path, 'w') as pipe_mod_file: + pipe_mod_file.write("{}\n\n".format(SAMPLE_IMPORT)) for l in intermediate_subtype_lines: pipe_mod_file.write(l) pipe_mod_file.write("\n\n") for l in leaf_subtype_lines: pipe_mod_file.write(l) - atacseq_piface_data[SUBTYPES_KEY] = target if spec_type == "single" \ - else {ATAC_PROTOCOL_NAME: target} + + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = \ + target if spec_type == "single" else \ + {ATAC_PROTOCOL_NAME: target} conf_path = _write_config_data( - protomap={ATAC_PROTOCOL_NAME: name_atac_pipe}, + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) subtype = piface.fetch_sample_subtype( - protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=name_atac_pipe, + protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + assert target == subtype.__name__ @@ -635,7 +648,7 @@ def atacseq_piface_data_with_subtypes( def _class_definition_lines(name, name_super_type): """ Create lines that define a class. """ - return ["class {t}({st}):\n".format(name), + return ["class {t}({st}):\n".format(t=name, st=name_super_type), "\tdef __init__(self, *args, **kwarggs):\n", "\t\tsuper({t}, self).__init__(*args, **kwargs)".format( t=name, st=name_super_type)] @@ -650,9 +663,8 @@ def _create_module(lines_by_class, filepath): :param str filepath: path to module file to create :return str: path to the module file written """ - header = "from looper.models import Sample" lines = "\n\n".join( - [header] + ["\n".join(class_lines) + [SAMPLE_IMPORT] + ["\n".join(class_lines) for class_lines in lines_by_class]) with open(filepath, 'w') as modfile: modfile.write("{}\n".format(lines)) From 65b2efcec4e1cc4ba72e13e1ed509c3f171f5a03 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 27 Jun 2017 11:38:43 -0400 Subject: [PATCH 92/94] broader fixture sharing --- tests/models/conftest.py | 174 ++++++++++++++++++++++++++- tests/models/independent/conftest.py | 173 -------------------------- 2 files changed, 172 insertions(+), 175 deletions(-) delete mode 100644 tests/models/independent/conftest.py diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 410c6303..afc44bf4 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -1,8 +1,17 @@ -""" Models' tests' configuration. 
""" +""" Configuration for modules with independent tests of models. """ from collections import OrderedDict -import pytest +import copy +import sys +if sys.version_info < (3, 3): + from collections import Iterable, Mapping +else: + from collections.abc import Iterable, Mapping + import pandas as pd +import pytest + +from looper.models import DEFAULT_COMPUTE_RESOURCES_NAME __author__ = "Vince Reuter" @@ -32,6 +41,111 @@ """ +# Compute resource bundles for pipeline interface configuration data +DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, + "time": "0-01:00:00", "partition": "local"} +MIDSIZE_RESOURCES = {"file_size": 10, "cores": 8, "mem": 16000, + "time": "0-07:00:00", "partition": "serial"} +HUGE_RESOURCES = {"file_size": 30, "cores": 24, "mem": 64000, + "time": "30-00:00:00", "partition": "longq"} + + + +def pytest_generate_tests(metafunc): + """ Conditional customization of test cases in this directory. """ + try: + classname = metafunc.cls.__name__ + except AttributeError: + # Some functions don't belong to a class. + pass + else: + if classname == "ConstructorPathParsingTests": + # Provide test case with two PipelineInterface config bundles. + metafunc.parametrize( + argnames="config_bundles", + argvalues=[(atacseq_iface_without_resources(), + {"name": "sans-path"})]) + + + +@pytest.fixture(scope="function") +def atacseq_iface_without_resources(): + """ + Provide the ATAC-Seq pipeline interface as a fixture, without resources. + + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :return Mapping: all of the pipeline interface configuration data for + ATAC-Seq, minus the resources section + """ + return { + "name": "ATACseq", + "looper_args": True, + "required_input_files": ["read1", "read2"], + "all_input_files": ["read1", "read2"], + "ngs_input_files": ["read1", "read2"], + "arguments": { + "--sample-name": "sample_name", + "--genome": "genome", + "--input": "read1", + "--input2": "read2", + "--single-or-paired": "read_type" + }, + "optional_arguments": { + "--frip-ref-peaks": "FRIP_ref", + "--prealignments": "prealignments", + "--genome-size": "macs_genome_size" + } + } + + + +@pytest.fixture(scope="function") +def atac_pipe_name(): + return "ATACSeq.py" + + + +@pytest.fixture(scope="function") +def atacseq_iface_with_resources( + atacseq_iface_without_resources, resources): + """ + + :param dict atacseq_iface_without_resources: PipelineInterface config + data, minus a resources section + :param Mapping resources: resources section of PipelineInterface + configuration data + :return Mapping: pipeline interface data for ATAC-Seq pipeline, with all + of the base sections plus resources section + """ + iface_data = copy.deepcopy(atacseq_iface_without_resources) + iface_data["resources"] = copy.deepcopy(resources) + return iface_data + + + +@pytest.fixture(scope="function") +def atacseq_piface_data(atacseq_iface_with_resources, atac_pipe_name): + """ + Provide a test case with data for an ATACSeq PipelineInterface. 
+ + :param str atac_pipe_name: name/key for the pipeline to which the + interface data pertains + :return dict: configuration data needed to create PipelineInterface + """ + return {atac_pipe_name: copy.deepcopy(atacseq_iface_with_resources)} + + + +@pytest.fixture(scope="function") +def default_resources(): + """ Provide test case with default PipelineInterface resources section. """ + return copy.deepcopy(DEFAULT_RESOURCES) + + @pytest.fixture(scope="function") def env_config_filepath(tmpdir): @@ -42,6 +156,20 @@ def env_config_filepath(tmpdir): +@pytest.fixture(scope="function") +def huge_resources(): + """ Provide non-default resources spec. section for PipelineInterface. """ + return copy.deepcopy(HUGE_RESOURCES) + + + +@pytest.fixture(scope="function") +def midsize_resources(): + """ Provide non-default resources spec. section for PipelineInterface. """ + return copy.deepcopy(MIDSIZE_RESOURCES) + + + @pytest.fixture(scope="function") def minimal_project_conf_path(tmpdir): """ Write minimal sample annotations and project configuration. """ @@ -55,3 +183,45 @@ def minimal_project_conf_path(tmpdir): "metadata:\n sample_annotation: {}".format(anns_file) conf_file.write(config_lines) return conf_file.strpath + + + +@pytest.fixture(scope="function") +def piface_config_bundles(request, resources): + """ + Provide the ATAC-Seq pipeline interface as a fixture, including resources. + + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :param pytest._pytest.fixtures.SubRequest request: hook into test case + requesting this fixture, which is queried for a resources value with + which to override the default if it's present. + :param Mapping resources: pipeline interface resource specification + :return Iterable[Mapping]: collection of bundles of pipeline interface + configuration bundles + """ + iface_config_datas = request.getfixturevalue("config_bundles") + if isinstance(iface_config_datas, Mapping): + data_bundles = iface_config_datas.values() + elif isinstance(iface_config_datas, Iterable): + data_bundles = iface_config_datas + else: + raise TypeError("Expected mapping or list collection of " + "PipelineInterface data: {} ({})".format( + iface_config_datas, type(iface_config_datas))) + resource_specification = request.getfixturevalue("resources") \ + if "resources" in request.fixturenames else resources + for config_bundle in data_bundles: + config_bundle.update(resource_specification) + return iface_config_datas + + + +@pytest.fixture(scope="function") +def resources(): + """ Basic PipelineInterface compute resources data. """ + return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), + "huge": copy.copy(HUGE_RESOURCES)} diff --git a/tests/models/independent/conftest.py b/tests/models/independent/conftest.py deleted file mode 100644 index b32c1609..00000000 --- a/tests/models/independent/conftest.py +++ /dev/null @@ -1,173 +0,0 @@ -""" Configuration for modules with independent tests of models. 
""" - -import copy -import sys -if sys.version_info < (3, 3): - from collections import Iterable, Mapping -else: - from collections.abc import Iterable, Mapping -import pytest -from looper.models import DEFAULT_COMPUTE_RESOURCES_NAME - - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - - -# Compute resource bundles for pipeline interface configuration data -DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, - "time": "0-01:00:00", "partition": "local"} -MIDSIZE_RESOURCES = {"file_size": 10, "cores": 8, "mem": 16000, - "time": "0-07:00:00", "partition": "serial"} -HUGE_RESOURCES = {"file_size": 30, "cores": 24, "mem": 64000, - "time": "30-00:00:00", "partition": "longq"} - - - -def pytest_generate_tests(metafunc): - """ Conditional customization of test cases in this directory. """ - try: - classname = metafunc.cls.__name__ - except AttributeError: - # Some functions don't belong to a class. - pass - else: - if classname == "ConstructorPathParsingTests": - # Provide test case with two PipelineInterface config bundles. - metafunc.parametrize( - argnames="config_bundles", - argvalues=[(atacseq_iface_without_resources(), - {"name": "sans-path"})]) - - - -@pytest.fixture(scope="function") -def atacseq_iface_without_resources(): - """ - Provide the ATAC-Seq pipeline interface as a fixture, without resources. - - Note that this represents the configuration data for the interface for a - single pipeline. In order to use this in the form that a PipelineInterface - expects, this needs to be the value to which a key is mapped within a - larger Mapping. - - :return Mapping: all of the pipeline interface configuration data for - ATAC-Seq, minus the resources section - """ - return { - "name": "ATACseq", - "looper_args": True, - "required_input_files": ["read1", "read2"], - "all_input_files": ["read1", "read2"], - "ngs_input_files": ["read1", "read2"], - "arguments": { - "--sample-name": "sample_name", - "--genome": "genome", - "--input": "read1", - "--input2": "read2", - "--single-or-paired": "read_type" - }, - "optional_arguments": { - "--frip-ref-peaks": "FRIP_ref", - "--prealignments": "prealignments", - "--genome-size": "macs_genome_size" - } - } - - - -@pytest.fixture(scope="function") -def atac_pipe_name(): - return "ATACSeq.py" - - - -@pytest.fixture(scope="function") -def atacseq_iface_with_resources( - atacseq_iface_without_resources, resources): - """ - - :param dict atacseq_iface_without_resources: PipelineInterface config - data, minus a resources section - :param Mapping resources: resources section of PipelineInterface - configuration data - :return Mapping: pipeline interface data for ATAC-Seq pipeline, with all - of the base sections plus resources section - """ - iface_data = copy.deepcopy(atacseq_iface_without_resources) - iface_data["resources"] = copy.deepcopy(resources) - return iface_data - - - -@pytest.fixture(scope="function") -def atacseq_piface_data(atacseq_iface_with_resources, atac_pipe_name): - """ - Provide a test case with data for an ATACSeq PipelineInterface. 
- - :param str atac_pipe_name: name/key for the pipeline to which the - interface data pertains - :return dict: configuration data needed to create PipelineInterface - """ - return {atac_pipe_name: copy.deepcopy(atacseq_iface_with_resources)} - - - -@pytest.fixture(scope="function") -def default_resources(): - return copy.deepcopy(DEFAULT_RESOURCES) - - - -@pytest.fixture(scope="function") -def huge_resources(): - return copy.deepcopy(HUGE_RESOURCES) - - - -@pytest.fixture(scope="function") -def midsize_resources(): - return copy.deepcopy(MIDSIZE_RESOURCES) - - - -@pytest.fixture(scope="function") -def piface_config_bundles(request, resources): - """ - Provide the ATAC-Seq pipeline interface as a fixture, including resources. - - Note that this represents the configuration data for the interface for a - single pipeline. In order to use this in the form that a PipelineInterface - expects, this needs to be the value to which a key is mapped within a - larger Mapping. - - :param pytest._pytest.fixtures.SubRequest request: hook into test case - requesting this fixture, which is queried for a resources value with - which to override the default if it's present. - :param Mapping resources: pipeline interface resource specification - :return Iterable[Mapping]: collection of bundles of pipeline interface - configuration bundles - """ - iface_config_datas = request.getfixturevalue("config_bundles") - if isinstance(iface_config_datas, Mapping): - data_bundles = iface_config_datas.values() - elif isinstance(iface_config_datas, Iterable): - data_bundles = iface_config_datas - else: - raise TypeError("Expected mapping or list collection of " - "PipelineInterface data: {} ({})".format( - iface_config_datas, type(iface_config_datas))) - resources = request.getfixturevalue("resources") \ - if "resources" in request.fixturenames else resources - for config_bundle in data_bundles: - config_bundle.update(resources) - return iface_config_datas - - - -@pytest.fixture(scope="function") -def resources(): - """ Basic PipelineInterface compute resources data. """ - return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), - "huge": copy.copy(HUGE_RESOURCES)} From f7cc27e97dd7c211ffe27511ea0f24a72c96e748 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 27 Jun 2017 16:41:54 -0400 Subject: [PATCH 93/94] fixing the smokes and adding sample sheet tests --- looper/models.py | 88 ++++++----- tests/conftest.py | 58 ++++++- tests/models/conftest.py | 73 ++++++++- tests/models/independent/test_Project.py | 8 + .../test_Project_Sample_interaction.py | 146 +++++++++++++++++- tests/models/test_models_smoke.py | 125 +++++++++------ 6 files changed, 405 insertions(+), 93 deletions(-) diff --git a/looper/models.py b/looper/models.py index 644eb673..e0ef9390 100644 --- a/looper/models.py +++ b/looper/models.py @@ -106,7 +106,6 @@ def check_sheet(sample_file, dtype=str): :raises IOError: if given annotations file can't be read. :raises ValueError: if required column(s) is/are missing. 
""" - df = _pd.read_table(sample_file, sep=None, dtype=dtype, index_col=False, engine="python") req = [SAMPLE_NAME_COLNAME] @@ -511,6 +510,9 @@ def __len__(self): def __repr__(self): return repr(self.__dict__) + def __str__(self): + return "{}: {}".format(self.__class__.__name__, repr(self)) + @copy @@ -1048,7 +1050,8 @@ def include(sample): def include(_): return True - return _pd.DataFrame([s for s in self.samples if include(s)]) + return _pd.DataFrame( + [s.as_series() for s in self.samples if include(s)]) def make_project_dirs(self): @@ -1452,6 +1455,14 @@ def __init__(self, series): self.paths = Paths() + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + + def __ne__(self, other): + return not self == other + + def __getitem__(self, item): """ Provides dict-style access to attributes @@ -2460,7 +2471,8 @@ def __init__(self, interface_data_source): else: raise ValueError("Alleged pipelines location '{}' exists neither " - "as a file nor as a folder.".format(interface_data_source)) + "as a file nor as a folder.". + format(interface_data_source)) def __repr__(self): @@ -2549,41 +2561,6 @@ def fetch_sample_subtype( return subtype - @classmethod - def _parse_iface_data(cls, pipe_iface_data): - """ - Parse data from mappings to set instance attributes. - - The data that define a ProtocolInterface are a "protocol_mapping" - Mapping and a "pipelines" Mapping, which are used to create a - ProtocolMapper and a PipelineInterface, representing the configuration - data for pipeline(s) from a single location. There are a couple of - different ways (file, folder, and eventually, raw Mapping) to provide - this data, and this function provides some standardization to how - those data are processed, independent of input type/format. - - :param Mapping[str, Mapping] pipe_iface_data: mapping from section - name to section data mapping; more specifically, the protocol - mappings Mapping and the PipelineInterface mapping - :return list[(str, ProtocolMapper | PipelineInterface)]: pairs of - attribute name for the ProtocolInterface being created, and the - value for that attribute, - """ - assignments = [("protocol_mapping", ProtocolMapper, "protomap"), - ("pipelines", PipelineInterface, "pipe_iface")] - attribute_values = [] - for section_name, data_type, attr_name in assignments: - try: - data = pipe_iface_data[section_name] - except KeyError: - _LOGGER.error("Error creating %s from data: %s", - cls.__name__, str(pipe_iface_data)) - raise Exception("PipelineInterface file lacks section: '{}'". - format(section_name)) - attribute_values.append((attr_name, data_type(data))) - return attribute_values - - def finalize_pipeline_key_and_paths(self, pipeline_key): """ Determine pipeline's full path, arguments, and strict key. @@ -2632,6 +2609,41 @@ def finalize_pipeline_key_and_paths(self, pipeline_key): return strict_pipeline_key, script_path_only, script_path_with_flags + @classmethod + def _parse_iface_data(cls, pipe_iface_data): + """ + Parse data from mappings to set instance attributes. + + The data that define a ProtocolInterface are a "protocol_mapping" + Mapping and a "pipelines" Mapping, which are used to create a + ProtocolMapper and a PipelineInterface, representing the configuration + data for pipeline(s) from a single location. There are a couple of + different ways (file, folder, and eventually, raw Mapping) to provide + this data, and this function provides some standardization to how + those data are processed, independent of input type/format. 
+ + :param Mapping[str, Mapping] pipe_iface_data: mapping from section + name to section data mapping; more specifically, the protocol + mappings Mapping and the PipelineInterface mapping + :return list[(str, ProtocolMapper | PipelineInterface)]: pairs of + attribute name for the ProtocolInterface being created, and the + value for that attribute, + """ + assignments = [("protocol_mapping", ProtocolMapper, "protomap"), + ("pipelines", PipelineInterface, "pipe_iface")] + attribute_values = [] + for section_name, data_type, attr_name in assignments: + try: + data = pipe_iface_data[section_name] + except KeyError: + _LOGGER.error("Error creating %s from data: %s", + cls.__name__, str(pipe_iface_data)) + raise Exception("PipelineInterface file lacks section: '{}'". + format(section_name)) + attribute_values.append((attr_name, data_type(data))) + return attribute_values + + @copy class ProtocolMapper(Mapping): diff --git a/tests/conftest.py b/tests/conftest.py index 46c73631..4765186b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ """ +import copy import logging import os import shutil @@ -19,8 +20,7 @@ import yaml from looper import setup_looper_logger -from looper.models import PipelineInterface -from looper.loodels import Project +from looper.models import PipelineInterface, Project, SAMPLE_NAME_COLNAME _LOGGER = logging.getLogger("looper") @@ -173,6 +173,18 @@ } COMPARISON_FUNCTIONS = ["__eq__", "__ne__", "__len__", "keys", "values", "items"] +COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"] +PROJECT_CONFIG_DATA = {"metadata": {"sample_annotation": "annotations.csv"}} + + + +def update_project_conf_data(extension): + """ Updated Project configuration data mapping based on file extension """ + updated = copy.deepcopy(PROJECT_CONFIG_DATA) + filename = updated["metadata"]["sample_annotation"] + base, _ = os.path.splitext(filename) + updated["metadata"]["sample_annotation"] = "{}.{}".format(base, extension) + return updated @@ -196,12 +208,6 @@ def pytest_generate_tests(metafunc): -@pytest.fixture(scope="function") -def sample_annotation_lines(): - return SAMPLE_ANNOTATION_LINES - - - @pytest.fixture(scope="session", autouse=True) def conf_logs(request): """ Configure logging for the testing session. """ @@ -215,6 +221,42 @@ def conf_logs(request): + +@pytest.fixture(scope="function") +def sample_annotation_lines(): + return SAMPLE_ANNOTATION_LINES + + + +@pytest.fixture(scope="function") +def path_empty_project(request, tmpdir): + """ Provide path to Project config file with empty annotations. """ + + # Determine how to write the data and how to name a file. + if "delimiter" in request.fixturenames: + delimiter = request.getfixturevalue("delimiter") + extension = "txt" + else: + delimiter = "," + extension = "csv" + + # Update the Project configuration data. + conf_data = update_project_conf_data(extension) + + # Write the needed files. 
+ anns_path = os.path.join( + tmpdir.strpath, conf_data["metadata"]["sample_annotation"]) + + with open(anns_path, 'w') as anns_file: + anns_file.write(delimiter.join(COLUMNS)) + conf_path = os.path.join(tmpdir.strpath, "proj-conf.yaml") + with open(conf_path, 'w') as conf_file: + yaml.dump(conf_data, conf_file) + + return conf_path + + + def interactive(prj_lines=PROJECT_CONFIG_LINES, iface_lines=PIPELINE_INTERFACE_CONFIG_LINES, merge_table_lines = MERGE_TABLE_LINES, diff --git a/tests/models/conftest.py b/tests/models/conftest.py index afc44bf4..514ffd53 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -2,6 +2,7 @@ from collections import OrderedDict import copy +import os import sys if sys.version_info < (3, 3): from collections import Iterable, Mapping @@ -10,8 +11,9 @@ import pandas as pd import pytest +import yaml -from looper.models import DEFAULT_COMPUTE_RESOURCES_NAME +from looper.models import DEFAULT_COMPUTE_RESOURCES_NAME, SAMPLE_NAME_COLNAME __author__ = "Vince Reuter" @@ -40,6 +42,7 @@ submission_command: sh """ +BASIC_PROTOMAP = {"ATAC": "ATACSeq.py"} # Compute resource bundles for pipeline interface configuration data DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, @@ -105,6 +108,7 @@ def atacseq_iface_without_resources(): @pytest.fixture(scope="function") def atac_pipe_name(): + """ Oft-used as filename for pipeline module and PipelineInterface key. """ return "ATACSeq.py" @@ -140,6 +144,39 @@ def atacseq_piface_data(atacseq_iface_with_resources, atac_pipe_name): +@pytest.fixture(scope="function") +def basic_data_raw(): + return copy.deepcopy({ + "AttributeDict": {}, "ProtocolMapper": BASIC_PROTOMAP, + "Sample": {SAMPLE_NAME_COLNAME: "arbitrary-sample"}}) + + + +@pytest.fixture(scope="function") +def basic_instance_data(request, instance_raw_data): + """ + Transform the raw data for a basic model instance to comply with its ctor. + + :param pytest._pytest.fixtures.SubRequest request: test case requesting + the basic instance data + :param Mapping instance_raw_data: the raw data needed to create a + model instance + :return object: basic instance data in a form accepted by its constructor + """ + # Cleanup is free with _write_config, using request's temp folder. + transformation_by_class = { + "AttributeDict": lambda data: data, + "PipelineInterface": lambda data: + _write_config(data, request, "pipeline_interface.yaml"), + "ProtocolInterface": lambda data: + _write_config(data, request, "pipeline_interface.yaml"), + "ProtocolMapper": lambda data: data, + "Sample": lambda data: pd.Series(data)} + which_class = request.getfixturevalue("class_name") + return transformation_by_class[which_class](instance_raw_data) + + + @pytest.fixture(scope="function") def default_resources(): """ Provide test case with default PipelineInterface resources section. """ @@ -163,6 +200,21 @@ def huge_resources(): +@pytest.fixture(scope="function") +def instance_raw_data(request, basic_data_raw, atacseq_piface_data): + """ Supply the raw data for a basic model instance as a fixture. """ + which_class = request.getfixturevalue("class_name") + if which_class == "PipelineInterface": + return copy.deepcopy(atacseq_piface_data) + elif which_class == "ProtocolInterface": + return {"protocol_mapping": + copy.deepcopy(basic_data_raw["ProtocolMapper"]), + "pipelines": copy.deepcopy(atacseq_piface_data)} + else: + return copy.deepcopy(basic_data_raw[which_class]) + + + @pytest.fixture(scope="function") def midsize_resources(): """ Provide non-default resources spec. 
section for PipelineInterface. """ @@ -225,3 +277,22 @@ def resources(): """ Basic PipelineInterface compute resources data. """ return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), "huge": copy.copy(HUGE_RESOURCES)} + + + +def _write_config(data, request, filename): + """ + Write configuration data to file. + + :param str Sequence | Mapping data: data to write to file, YAML compliant + :param pytest._pytest.fixtures.SubRequest request: test case that + requested a fixture from which this function was called + :param str filename: name for the file to write + :return str: full path to the file written + """ + # We get cleanup for free by writing to file in requests temp folder. + dirpath = request.getfixturevalue("tmpdir").strpath + filepath = os.path.join(dirpath, filename) + with open(filepath, 'w') as conf_file: + yaml.safe_dump(data, conf_file) + return filepath diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index 4d3fe5f8..f5ff1ba4 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -46,6 +46,14 @@ class ProjectConstructorTests: """ Tests of Project constructor, particularly behavioral details. """ + def test_no_samples(self, path_empty_project): + """ Lack of Samples is unproblematic. """ + p = Project(path_empty_project) + assert 0 == p.num_samples + assert [] == list(p.samples) + + + @pytest.mark.parametrize( argnames="spec_type", argvalues=["as_null", "missing"], ids=lambda spec: "spec_type={}".format(spec)) diff --git a/tests/models/integration/test_Project_Sample_interaction.py b/tests/models/integration/test_Project_Sample_interaction.py index 40ebf7f3..f30aa2d1 100644 --- a/tests/models/integration/test_Project_Sample_interaction.py +++ b/tests/models/integration/test_Project_Sample_interaction.py @@ -1,11 +1,18 @@ """ Tests for interaction between a Project and a Sample. """ from collections import OrderedDict +import copy +import itertools import os +import random + import pandas as pd import pytest import yaml -from looper.models import Project, SAMPLE_ANNOTATIONS_KEY + +from looper.models import \ + merge_sample, Project, Sample, \ + SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME __author__ = "Vince Reuter" @@ -22,6 +29,143 @@ "input_dir": "dummy/sequencing/data", "tools_folder": "arbitrary-seq-tools-folder"} +NAME_ANNOTATIONS_FILE = "annotations.csv" +SAMPLE_NAMES = ["WGBS_mm10", "ATAC_mm10", "WGBS_rn6", "ATAC_rn6"] +COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"] +VALUES1 = [random.randint(-5, 5) for _ in range(len(SAMPLE_NAMES))] +VALUES2 = [random.randint(-5, 5) for _ in range(len(SAMPLE_NAMES))] +LIBRARIES = ["WGBS", "ATAC", "WGBS", "ATAC"] +DATA = list(zip(SAMPLE_NAMES, VALUES1, VALUES2, LIBRARIES)) +DATA_FOR_SAMPLES = [ + {SAMPLE_NAME_COLNAME: SAMPLE_NAMES}, + {"val1": VALUES1}, {"val2": VALUES2}, {"library": LIBRARIES}] +PROJECT_CONFIG_DATA = {"metadata": {"sample_annotation": NAME_ANNOTATIONS_FILE}} +PROTOCOLS = ["WGBS", "ATAC"] + + + +def pytest_generate_tests(metafunc): + """ Customization of test cases within this module. """ + if metafunc.cls == BuildSheetTests: + if "protocols" in metafunc.fixturenames: + # Apply the test case to each of the possible combinations of + # protocols, from none at all up to all of them. 
+            metafunc.parametrize(
+                argnames="protocols",
+                argvalues=list(itertools.chain.from_iterable(
+                    itertools.combinations(PROTOCOLS, x)
+                    for x in range(1 + len(PROTOCOLS)))))
+        if "delimiter" in metafunc.fixturenames:
+            metafunc.parametrize(argnames="delimiter", argvalues=[",", "\t"])
+
+
+
+@pytest.fixture(scope="function")
+def proj_conf():
+    """ Provide the basic configuration data. """
+    return copy.deepcopy(PROJECT_CONFIG_DATA)
+
+
+
+@pytest.fixture(scope="function")
+def path_proj_conf_file(tmpdir, proj_conf):
+    """ Write basic project configuration data and provide filepath. """
+    conf_path = os.path.join(tmpdir.strpath, "project_config.yaml")
+    with open(conf_path, 'w') as conf:
+        yaml.safe_dump(proj_conf, conf)
+    return conf_path
+
+
+
+@pytest.fixture(scope="function")
+def path_anns_file(request, tmpdir, sample_sheet):
+    """ Write basic annotations, optionally using a different delimiter. """
+    filepath = os.path.join(tmpdir.strpath, NAME_ANNOTATIONS_FILE)
+    if "delimiter" in request.fixturenames:
+        delimiter = request.getfixturevalue("delimiter")
+    else:
+        delimiter = ","
+    with open(filepath, 'w') as anns_file:
+        sample_sheet.to_csv(anns_file, sep=delimiter, index=False)
+    return filepath
+
+
+
+@pytest.fixture(scope="function")
+def samples_rawdata():
+    return copy.deepcopy(DATA_FOR_SAMPLES)
+
+
+
+@pytest.fixture(scope="function")
+def sample_sheet(samples_rawdata):
+    return pd.DataFrame(samples_rawdata)
+
+
+
+class BuildSheetTests:
+    """ Tests for construction of sheet of Project's Samples. """
+
+    # Note: seemingly unused parameters may affect parameterization
+    # logic of other fixtures used by a test case; tread lightly.
+
+
+    def test_no_samples(self, protocols, delimiter, path_empty_project):
+        """ Lack of Samples is unproblematic for the sheet build. """
+        # Regardless of protocol(s), the sheet should be empty.
+        p = Project(path_empty_project)
+        sheet = p.build_sheet(*protocols)
+        assert sheet.empty
+
+
+    @pytest.mark.parametrize(
+            argnames="which_sample_index", argvalues=range(len(SAMPLE_NAMES)))
+    def test_single_sample(
+            self, tmpdir, path_proj_conf_file, which_sample_index):
+        """ Single Sample is perfectly valid for Project and sheet. """
+
+        # Pull out the values for the current sample.
+        values = DATA[which_sample_index]
+
+        # Write the annotations.
+        anns_path = os.path.join(tmpdir.strpath, NAME_ANNOTATIONS_FILE)
+        with open(anns_path, 'w') as anns_file:
+            anns_file.write("{}\n".format(",".join(COLUMNS)))
+            anns_file.write("{}\n".format(",".join([str(v) for v in values])))
+
+        # Build the sheet.
+        p = Project(path_proj_conf_file)
+        sheet = p.build_sheet()
+
+        # It should be a single-row DataFrame.
+        assert isinstance(sheet, pd.DataFrame)
+        assert 1 == len(sheet)
+        assert 1 == p.num_samples
+
+        # There will be additional values added from the Project,
+        # but the core data values will have remained the same.
+        sample = list(p.samples)[0]
+        for attr, exp_val in zip(COLUMNS, values):
+            obs_val = getattr(sample, attr)
+            try:
+                assert exp_val == obs_val
+            except AssertionError as e:
+                try:
+                    assert exp_val == int(obs_val)
+                except AssertionError:
+                    raise e
+
+
+    def test_multiple_samples(
+            self, protocols, path_anns_file, path_proj_conf_file):
+        """ Project also processes multiple Samples fine. """
+        pass
+
+
+    def test_samples_are_generic(self, path_anns_file, path_proj_conf_file):
+        """ Regardless of protocol, Samples for sheet are generic. 
""" + pass + class SampleFolderCreationTests: diff --git a/tests/models/test_models_smoke.py b/tests/models/test_models_smoke.py index ec06fd6d..83c07879 100644 --- a/tests/models/test_models_smoke.py +++ b/tests/models/test_models_smoke.py @@ -1,10 +1,9 @@ """ Basic smoketests for models """ -import inspect import logging import pytest import looper -from looper.models import AttributeDict +from looper.models import AttributeDict, Project __author__ = "Vince Reuter" @@ -17,66 +16,102 @@ def pytest_generate_tests(metafunc): """ Dynamic test case parameterization. """ - if metafunc.cls == AttributeDictRepresentationSmokeTests: - metafunc.parametrize(argnames="representation_method", - argvalues=["__repr__", "__str__"]) - elif metafunc.cls == ObjectRepresentationSmokeTests: - metafunc.parametrize(argnames="class_name", - argvalues=looper.models.__classes__) - metafunc.parametrize(argnames="method_name", argvalues=["__repr__"]) + if "funcname" in metafunc.fixturenames: + metafunc.parametrize( + argnames="funcname", argvalues=["__repr__", "__str__"]) -class ObjectRepresentationSmokeTests: +@pytest.mark.usefixtures("write_project_files") +class AttributeDictRepresentationTests: + """ Non-fail validation of AttributeDict representations. """ + + + @pytest.mark.parametrize( + argnames="data", + argvalues=[[('CO', 145)], {'CO': {"US-50": [550, 62, 145]}}]) + def test_AttributeDict_representations_smoke( + self, data, funcname): + """ Text representation of base AttributeDict doesn't fail. """ + attrdict = AttributeDict(data) + getattr(attrdict, funcname).__call__() + + + def test_Project_representations_smoke(self, proj, funcname): + """ Representation of Project (AttributeDict subclass) is failsafe. """ + getattr(proj, funcname).__call__() + + + def test_project_repr_name_inclusion(self, proj, funcname): + """ Test Project text representation. """ + func = getattr(proj, funcname) + result = func.__call__() + assert type(result) is str + classname = proj.__class__.__name__ + if funcname == "__str__": + assert classname in result + elif funcname == "__repr__": + assert classname not in result + else: + raise ValueError("Unexpected representation function: {}". + format(funcname)) + + + +class ModelCreationSmokeTests: + """ Smoketests for creation of various types of project-related models. """ + + # TODO: migrate these to pytest.raises(None) with 3.1. + + def test_empty_project(self, path_empty_project): + """ It's unproblematic to create a Project that lacks samples. """ + Project(path_empty_project) + + + +class ModelRepresentationSmokeTests: """ Tests for the text representation of important ADTs. """ + # NOTE: similar parameterization, but Project construction needs + # to be handled with greater care when testing the actual call. - def test_implements_repr_smoke(self, class_name, method_name): + @pytest.mark.parametrize( + argnames="class_name", argvalues=looper.models.__classes__) + def test_implements_repr_smoke(self, class_name): """ Each important ADT must implement a representation method. """ + funcname = "__repr__" + # Attempt a control assertion, that a subclass that doesn't override # the given method of its superclass, uses the superclass version of # the function in question. class ObjectSubclass(object): def __init__(self): super(ObjectSubclass, self).__init__() - try: - subclass_version = getattr(ObjectSubclass, "__repr__") - superclass_version = getattr(object, method_name) - except AttributeError: - _LOGGER.debug("No object subclass vs. 
object validation for " - "method: '%s'", method_name) - else: - assert subclass_version is superclass_version + assert getattr(ObjectSubclass, funcname) is getattr(object, funcname) # Make the actual assertion of interest. adt = getattr(looper.models, class_name) - assert getattr(adt, method_name) != \ - getattr(adt.__bases__[0], method_name) - - - def test_repr_smoke(self, class_name, method_name): - """ Object representation method successfully returns string. """ - # TODO: "with pytest.raises(None)..." in 3.1+ - assert str is type(getattr(class_name, method_name).__call__()) - - - -@pytest.mark.usefixtures("write_project_files") -class AttributeDictRepresentationSmokeTests: - """ Non-fail validation of AttributeDict representations. """ + assert getattr(adt, funcname) != \ + getattr(adt.__bases__[0], funcname) @pytest.mark.parametrize( - argnames="data", - argvalues=[[('CO', 145)], {'CO': {"US-50": [550, 62, 145]}}]) - def test_AttributeDict_representations( - self, data, representation_method): - """ Text representation of base AttributeDict doesn't fail. """ - attrdict = AttributeDict(data) - getattr(attrdict, representation_method).__call__() - - - def test_Project_representations(self, proj, representation_method): - """ Representation of Project (AttributeDict subclass) is failsafe. """ - getattr(proj, representation_method).__call__() + argnames="class_name", + argvalues=[cn for cn in looper.models.__classes__ + if cn != "Project"]) + def test_repr_smoke( + self, tmpdir, class_name, basic_instance_data, funcname): + """ Object representation method successfully returns string. """ + # Note that tmpdir is used when config file needs to be written. + cls = getattr(looper.models, class_name) + instance = cls(basic_instance_data) + func = getattr(instance, funcname) + result = func.__call__() + if funcname == "__str__": + assert class_name in result + elif funcname == "__repr__": + assert type(result) is str + else: + raise ValueError("Unexpected representation method: {}". + format(funcname)) From 966286b95804ee667cc62c6dcc27b542ac2b734b Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Tue, 27 Jun 2017 18:10:04 -0400 Subject: [PATCH 94/94] cleaner protocol-based selection in sheet construction; better test organization, all passing --- looper/models.py | 17 ++------ tests/models/conftest.py | 24 +++++++++++ tests/models/independent/test_Project.py | 1 + .../test_Project_Sample_interaction.py | 42 +++++++++++++++---- 4 files changed, 63 insertions(+), 21 deletions(-) diff --git a/looper/models.py b/looper/models.py index e0ef9390..6f7ed895 100644 --- a/looper/models.py +++ b/looper/models.py @@ -1038,20 +1038,11 @@ def build_sheet(self, *protocols): given, else all of this Project's samples """ # Use all protocols if none are explicitly specified. 
- protocols = set(protocols or self.protocols) - if protocols: - protocols = set(protocols) - def include(sample): - try: - return sample.library in protocols - except AttributeError: - return False - else: - def include(_): - return True - + samples = self.samples + protocols = {alpha_cased(p) for p in (protocols or self.protocols)} return _pd.DataFrame( - [s.as_series() for s in self.samples if include(s)]) + [s.as_series() for s in samples if + hasattr(s, "library") and alpha_cased(s.library) in protocols]) def make_project_dirs(self): diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 514ffd53..3882d052 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -238,6 +238,30 @@ def minimal_project_conf_path(tmpdir): +@pytest.fixture(scope="function") +def path_proj_conf_file(tmpdir, proj_conf): + """ Write basic project configuration data and provide filepath. """ + conf_path = os.path.join(tmpdir.strpath, "project_config.yaml") + with open(conf_path, 'w') as conf: + yaml.safe_dump(proj_conf, conf) + return conf_path + + + +@pytest.fixture(scope="function") +def path_anns_file(request, tmpdir, sample_sheet): + """ Write basic annotations, optionally using a different delimiter. """ + filepath = os.path.join(tmpdir.strpath, "annotations.csv") + if "delimiter" in request.fixturenames: + delimiter = request.getfixturevalue("delimiter") + else: + delimiter = "," + with open(filepath, 'w') as anns_file: + sample_sheet.to_csv(anns_file, sep=delimiter, index=False) + return filepath + + + @pytest.fixture(scope="function") def piface_config_bundles(request, resources): """ diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index f5ff1ba4..3683a988 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -644,6 +644,7 @@ def _parse_flags_and_options(command_elements): return parsed_command_elements + def _write_project_config(config_data, dirpath, filename="proj-conf.yaml"): """ Write the configuration file for a Project. 
diff --git a/tests/models/integration/test_Project_Sample_interaction.py b/tests/models/integration/test_Project_Sample_interaction.py
index f30aa2d1..12d0a7fb 100644
--- a/tests/models/integration/test_Project_Sample_interaction.py
+++ b/tests/models/integration/test_Project_Sample_interaction.py
@@ -11,8 +11,9 @@
 import yaml
 
 from looper.models import \
-    merge_sample, Project, Sample, \
+    Project, Sample, \
     SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME
+from looper.utils import alpha_cased
 
 
 __author__ = "Vince Reuter"
@@ -54,7 +55,9 @@ def pytest_generate_tests(metafunc):
                 argnames="protocols",
                 argvalues=list(itertools.chain.from_iterable(
                     itertools.combinations(PROTOCOLS, x)
-                    for x in range(1 + len(PROTOCOLS)))))
+                    for x in range(1 + len(PROTOCOLS)))),
+                ids=lambda protos:
+                " protocols = {} ".format(",".join(protos)))
         if "delimiter" in metafunc.fixturenames:
             metafunc.parametrize(argnames="delimiter", argvalues=[",", "\t"])
 
@@ -93,13 +96,27 @@ def path_anns_file(request, tmpdir, sample_sheet):
 
 @pytest.fixture(scope="function")
 def samples_rawdata():
-    return copy.deepcopy(DATA_FOR_SAMPLES)
+    return copy.deepcopy(DATA)
 
 
 
 @pytest.fixture(scope="function")
 def sample_sheet(samples_rawdata):
-    return pd.DataFrame(samples_rawdata)
+    df = pd.DataFrame(samples_rawdata)
+    df.columns = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"]
+    return df
+
+
+
+def test_samples_are_generic(path_anns_file, path_proj_conf_file):
+    """ Regardless of protocol, Samples for sheet are generic. """
+    # Annotations filepath fixture also writes that file, so
+    # it's needed even though that return value isn't used locally.
+    p = Project(path_proj_conf_file)
+    assert len(SAMPLE_NAMES) == p.num_samples
+    samples = list(p.samples)
+    assert p.num_samples == len(samples)
+    assert all([Sample is type(s) for s in samples])
 
 
 
@@ -159,12 +176,21 @@ def test_single_sample(
     def test_multiple_samples(
             self, protocols, path_anns_file, path_proj_conf_file):
         """ Project also processes multiple Samples fine. """
-        pass
+        p = Project(path_proj_conf_file)
+
+        # Total sample count is constant.
+        assert len(SAMPLE_NAMES) == sum(1 for _ in p.samples)
 
-
-    def test_samples_are_generic(self, path_anns_file, path_proj_conf_file):
-        """ Regardless of protocol, Samples for sheet are generic. """
-        pass
+        # But the sheet permits filtering to specific protocol(s).
+        exp_num_samples = len(SAMPLE_NAMES) if not protocols else \
+            sum(sum(1 for l in LIBRARIES if l == p) for p in protocols)
+        sheet = p.build_sheet(*protocols)
+        assert exp_num_samples == len(sheet)
+        if protocols:
+            fuzzy_protos = {alpha_cased(p) for p in protocols}
+            for _, sample_data in sheet.iterrows():
+                assert alpha_cased(sample_data.library) in fuzzy_protos