Skip to content

Commit

Permalink
Handle duplicate derived_columns
Browse files Browse the repository at this point in the history
Previously, duplicated derived columns yielded broken paths: because
"data_source" is appended to derived_columns implicitly, also listing it
explicitly caused it to be processed twice, corrupting the derived values.
This fix keeps track of columns that have already been derived and skips
re-deriving them. A regression test is added.
  • Loading branch information
nsheff committed Jan 24, 2017
1 parent 27c39ca commit 59ddc96
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 7 deletions.
2 changes: 1 addition & 1 deletion looper/looper.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ def main():
file_checks=args.file_checks,
looperenv_file=getattr(args, 'env', None))
# add sample sheet
prj.add_sample_sheet()
#prj.add_sample_sheet()

print("Results subdir: " + prj.metadata.results_subdir)
print("Command: " + args.command)
Expand Down
16 changes: 11 additions & 5 deletions looper/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ class Project(AttributeDict):
"""
def __init__(self, config_file, subproject=None, dry=False, permissive=True, file_checks=False, looperenv_file=None):
# super(Project, self).__init__(**config_file)
self.DEBUG = False
self.DEBUG = True

# Initialize local, serial compute as default (no cluster submission)
from pkg_resources import resource_filename
Expand Down Expand Up @@ -187,7 +187,8 @@ def __init__(self, config_file, subproject=None, dry=False, permissive=True, fil

# Derived columns: by default, use data_source
if hasattr(self, "derived_columns"):
self.derived_columns.append("data_source")
if "data_source" not in self.derived_columns: # do not duplicate!
self.derived_columns.append("data_source")
else:
self.derived_columns = ["data_source"]

Expand Down Expand Up @@ -409,6 +410,9 @@ def add_sample_sheet(self, csv=None, permissive=None, file_checks=None):
:type file_checks: bool
"""
# If options are not passed, used what has been set for project
if self.DEBUG:
print("Add sample sheet.")

if permissive is None:
permissive = self.permissive
else:
Expand Down Expand Up @@ -671,6 +675,7 @@ def __init__(self, series, permissive=True):
raise TypeError("Provided object is not a pandas Series.")
super(Sample, self).__init__()
self.merged_cols = {}
self.derived_cols_done = []

# Keep a list of attributes that came from the sample sheet, so we can provide a
# minimal representation of the original sample as provided (in order!).
Expand Down Expand Up @@ -809,7 +814,7 @@ def locate_data_source(self, column_name = "data_source", source_key = None, ext
given a higher priority.
"""
# default_regex = "/scratch/lab_bsf/samples/{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam"

if not source_key:
if not hasattr(self, column_name):
raise AttributeError("You must provide a source_key, no attribute: " + source_key)
Expand Down Expand Up @@ -867,11 +872,12 @@ def set_file_paths(self, override=False):
if hasattr(self.prj, "derived_columns"):
for col in self.prj["derived_columns"]:

# Only proceed if the specified column exists, and was not already merged.
if hasattr(self, col) and col not in self.merged_cols:
# Only proceed if the specified column exists, and was not already merged or derived.
if hasattr(self, col) and col not in self.merged_cols and col not in self.derived_cols_done:
# set a variable called {col}_key, so the original source can also be retrieved
setattr(self, col + "_key", getattr(self, col))
setattr(self, col, self.locate_data_source(col))
self.derived_cols_done.append(col)

# parent
self.results_subdir = self.prj.metadata.results_subdir
Expand Down
13 changes: 13 additions & 0 deletions test_looper.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,17 @@ def test1(self):
self.assertTrue(s2.confirm_required_inputs())
self.assertEqual([os.path.basename(x) for x in s2.required_inputs], ['c.txt', 'c.txt'])

# Make sure derived cols don't get re-derived upon multiple calls of add_sample_sheet()
self.assertEqual(p.samples[2].file, "tests/data/c.txt")
p.add_sample_sheet()
p.add_sample_sheet()
self.assertEqual(p.samples[2].file, "tests/data/c.txt")

# Check that duplicate derived cols can still be derived
self.assertEqual(p.samples[2].nonmerged_col, "tests/data/c.txt")
self.assertEqual(p.samples[2].locate_data_source('file'), "")


# Can't set a non-ngs sample to an ngs pipeline
with self.assertRaises(TypeError):
s.set_pipeline_attributes(pi, "testngs.sh")
Expand All @@ -66,6 +77,8 @@ def test1(self):





if __name__ == '__main__':
unittest.main()

2 changes: 1 addition & 1 deletion tests/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ metadata:
pipelines_dir: pipelines
merge_table: merge.csv

derived_columns: [file, file2, dcol1, dcol2]
derived_columns: [file, file2, dcol1, dcol2, nonmerged_col, nonmerged_col, data_source]

data_sources:
src1: "tests/data/{sample_name}{col_modifier}.txt"
Expand Down

0 comments on commit 59ddc96

Please sign in to comment.