Skip to content

Commit

Permalink
Handle duplicate derived_columns
Browse files Browse the repository at this point in the history
Previously, duplicated derived columns yielded broken paths: because
"data_source" is appended to derived_columns implicitly, also listing it
explicitly caused it to be processed twice, corrupting the derived values.
This fix keeps track of columns that have already been derived and skips
re-deriving them. A regression test is added.
  • Loading branch information
nsheff committed Jan 24, 2017
1 parent 27c39ca commit 59ddc96
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 7 deletions.
2 changes: 1 addition & 1 deletion looper/looper.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ def main():
file_checks=args.file_checks,
looperenv_file=getattr(args, 'env', None))
# add sample sheet
prj.add_sample_sheet()
#prj.add_sample_sheet()

print("Results subdir: " + prj.metadata.results_subdir)
print("Command: " + args.command)
Expand Down
16 changes: 11 additions & 5 deletions looper/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ class Project(AttributeDict):
"""
def __init__(self, config_file, subproject=None, dry=False, permissive=True, file_checks=False, looperenv_file=None):
# super(Project, self).__init__(**config_file)
self.DEBUG = False
self.DEBUG = True

# Initialize local, serial compute as default (no cluster submission)
from pkg_resources import resource_filename
Expand Down Expand Up @@ -187,7 +187,8 @@ def __init__(self, config_file, subproject=None, dry=False, permissive=True, fil

# Derived columns: by default, use data_source
if hasattr(self, "derived_columns"):
self.derived_columns.append("data_source")
if "data_source" not in self.derived_columns: # do not duplicate!
self.derived_columns.append("data_source")
else:
self.derived_columns = ["data_source"]

Expand Down Expand Up @@ -409,6 +410,9 @@ def add_sample_sheet(self, csv=None, permissive=None, file_checks=None):
:type file_checks: bool
"""
# If options are not passed, used what has been set for project
if self.DEBUG:
print("Add sample sheet.")

if permissive is None:
permissive = self.permissive
else:
Expand Down Expand Up @@ -671,6 +675,7 @@ def __init__(self, series, permissive=True):
raise TypeError("Provided object is not a pandas Series.")
super(Sample, self).__init__()
self.merged_cols = {}
self.derived_cols_done = []

# Keep a list of attributes that came from the sample sheet, so we can provide a
# minimal representation of the original sample as provided (in order!).
Expand Down Expand Up @@ -809,7 +814,7 @@ def locate_data_source(self, column_name = "data_source", source_key = None, ext
given a higher priority.
"""
# default_regex = "/scratch/lab_bsf/samples/{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam"

if not source_key:
if not hasattr(self, column_name):
raise AttributeError("You must provide a source_key, no attribute: " + source_key)
Expand Down Expand Up @@ -867,11 +872,12 @@ def set_file_paths(self, override=False):
if hasattr(self.prj, "derived_columns"):
for col in self.prj["derived_columns"]:

# Only proceed if the specified column exists, and was not already merged.
if hasattr(self, col) and col not in self.merged_cols:
# Only proceed if the specified column exists, and was not already merged or derived.
if hasattr(self, col) and col not in self.merged_cols and col not in self.derived_cols_done:
# set a variable called {col}_key, so the original source can also be retrieved
setattr(self, col + "_key", getattr(self, col))
setattr(self, col, self.locate_data_source(col))
self.derived_cols_done.append(col)

# parent
self.results_subdir = self.prj.metadata.results_subdir
Expand Down
13 changes: 13 additions & 0 deletions test_looper.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,17 @@ def test1(self):
self.assertTrue(s2.confirm_required_inputs())
self.assertEqual([os.path.basename(x) for x in s2.required_inputs], ['c.txt', 'c.txt'])

# Make sure derived cols don't get re-derived upon multiple calls of add_sample_sheet()
self.assertEqual(p.samples[2].file, "tests/data/c.txt")
p.add_sample_sheet()
p.add_sample_sheet()
self.assertEqual(p.samples[2].file, "tests/data/c.txt")

# Check that duplicate derived cols can still be derived
self.assertEqual(p.samples[2].nonmerged_col, "tests/data/c.txt")
self.assertEqual(p.samples[2].locate_data_source('file'), "")


# Can't set a non-ngs sample to an ngs pipeline
with self.assertRaises(TypeError):
s.set_pipeline_attributes(pi, "testngs.sh")
Expand All @@ -66,6 +77,8 @@ def test1(self):





if __name__ == '__main__':
unittest.main()

2 changes: 1 addition & 1 deletion tests/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ metadata:
pipelines_dir: pipelines
merge_table: merge.csv

derived_columns: [file, file2, dcol1, dcol2]
derived_columns: [file, file2, dcol1, dcol2, nonmerged_col, nonmerged_col, data_source]

data_sources:
src1: "tests/data/{sample_name}{col_modifier}.txt"
Expand Down

0 comments on commit 59ddc96

Please sign in to comment.