Bring changes for 270 into 280 (#11)

* For EI-CoreBioinformatics#270: `mikado prepare` now removes redundant transcripts based on their *intron chains* / *monoexonic span overlap*, rather than on their start/end. Exact CDS match still applies.
* For EI-CoreBioinformatics#270: this commit should introduce all the features requested by @swarbred (tagging also @ljyanesm).
lucventurini · Apr 7, 2020 · d0a1ddb · d0a1ddb
1 parent 08d6cbc
commit d0a1ddb
Show file tree

Hide file tree

Showing 7 changed files with 281 additions and 120 deletions.
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
@@ -290,6 +290,7 @@
             "labels": {"type": "array", "default": []},
             "strand_specific_assemblies": {"type": "array", "default": []},
             "reference": {"type": "array", "default": []},
+            "keep_redundant": {"type": "array", "default": []},
             "source_score":{
               "type": "object",
               "default": {},

diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py
@@ -74,11 +74,11 @@ def run(self):
         while True:
             results = self.submission_queue.get()
             try:
-                label, handle, strand_specific, is_reference, shelf_name = results
+                label, handle, strand_specific, is_reference, keep_redundant, shelf_name = results
             except ValueError as exc:
                 raise ValueError("{}.\tValues: {}".format(exc, ", ".join([str(_) for _ in results])))
             if handle == "EXIT":
-                self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT"))
+                self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT", "EXIT"))
                 break
             counter += 1
             self.logger.debug("Received %s (label: %s; SS: %s, shelf_name: %s)",
@@ -98,6 +98,7 @@ def run(self):
                                             max_intron=self.max_intron,
                                             strip_cds=self.__strip_cds,
                                             is_reference=is_reference,
+                                            keep_redundant=keep_redundant,
                                             strand_specific=strand_specific)
                 elif gff_handle.__annot_type__ == "gtf":
                     new_ids = load_from_gtf(shelf_name,
@@ -109,6 +110,7 @@ def run(self):
                                             max_intron=self.max_intron,
                                             is_reference=is_reference,
                                             strip_cds=self.__strip_cds,
+                                            keep_redundant=keep_redundant,
                                             strand_specific=strand_specific)
                 elif gff_handle.__annot_type__ == "bed12":
                     new_ids = load_from_bed12(shelf_name,
@@ -120,6 +122,7 @@ def run(self):
                                               min_length=self.min_length,
                                               max_intron=self.max_intron,
                                               strip_cds=self.__strip_cds,
+                                              keep_redundant=keep_redundant,
                                               strand_specific=strand_specific)
                 else:
                     raise ValueError("Invalid file type: {}".format(gff_handle.name))
@@ -392,6 +395,7 @@ def load_from_gff(shelf_name,
                   min_length=0,
                   max_intron=3*10**5,
                   is_reference=False,
+                  keep_redundant=False,
                   strip_cds=False,
                   strand_specific=False):
     """
@@ -469,6 +473,7 @@ def load_from_gff(shelf_name,
 
             exon_lines[row.id]["strand_specific"] = strand_specific
             exon_lines[row.id]["is_reference"] = is_reference
+            exon_lines[row.id]["keep_redundant"] = keep_redundant
             continue
         elif row.is_exon is True:
             if not row.is_cds or (row.is_cds is True and strip_cds is False):
@@ -513,6 +518,7 @@ def load_from_gff(shelf_name,
                         exon_lines[tid]["parent"] = transcript2genes[tid]
                         exon_lines[tid]["strand_specific"] = strand_specific
                         exon_lines[tid]["is_reference"] = is_reference
+                        exon_lines[tid]["keep_redundant"] = keep_redundant
                     elif tid not in exon_lines and tid not in transcript2genes:
                         continue
                     else:
@@ -547,6 +553,7 @@ def load_from_gtf(shelf_name,
                   min_length=0,
                   max_intron=3*10**5,
                   is_reference=False,
+                  keep_redundant=False,
                   strip_cds=False,
                   strand_specific=False):
     """
@@ -609,6 +616,7 @@ def load_from_gtf(shelf_name,
             exon_lines[row.transcript]["parent"] = "{}.gene".format(row.id)
             exon_lines[row.transcript]["strand_specific"] = strand_specific
             exon_lines[row.transcript]["is_reference"] = is_reference
+            exon_lines[row.transcript]["keep_redundant"] = keep_redundant
             if "exon_number" in exon_lines[row.transcript]["attributes"]:
                 del exon_lines[row.transcript]["attributes"]["exon_number"]
             continue
@@ -635,6 +643,7 @@ def load_from_gtf(shelf_name,
             exon_lines[row.transcript]["parent"] = "{}.gene".format(row.transcript)
             exon_lines[row.transcript]["strand_specific"] = strand_specific
             exon_lines[row.transcript]["is_reference"] = is_reference
+            exon_lines[row.transcript]["keep_redundant"] = keep_redundant
         else:
             if row.transcript in to_ignore:
                 continue
@@ -666,6 +675,7 @@ def load_from_bed12(shelf_name,
                     min_length=0,
                     max_intron=3*10**5,
                     is_reference=False,
+                    keep_redundant=False,
                     strip_cds=False,
                     strand_specific=False):
     """
@@ -727,6 +737,7 @@ def load_from_bed12(shelf_name,
             exon_lines[transcript.id]["parent"] = "{}.gene".format(transcript.id)
             exon_lines[transcript.id]["strand_specific"] = strand_specific
             exon_lines[transcript.id]["is_reference"] = is_reference
+            exon_lines[transcript.id]["keep_redundant"] = keep_redundant
             exon_lines[transcript.id]["features"]["exon"] = [
                 (exon[0], exon[1]) for exon in transcript.exons
             ]