Skip to content

Commit

Permalink
Bring changes for 270 into 280 (#11)
Browse files Browse the repository at this point in the history
* For EI-CoreBioinformatics#270: `mikado prepare` now removes redundant transcripts based on their *intron chains* (or, for monoexonic transcripts, their *span overlap*) rather than on their start/end coordinates. Exact CDS match still applies.

* For EI-CoreBioinformatics#270: this commit should introduce all the features requested by @swarbred (also tagging @ljyanesm).
  • Loading branch information
lucventurini committed Apr 7, 2020
1 parent 08d6cbc commit d0a1ddb
Show file tree
Hide file tree
Showing 7 changed files with 281 additions and 120 deletions.
1 change: 1 addition & 0 deletions Mikado/configuration/configuration_blueprint.json
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@
"labels": {"type": "array", "default": []},
"strand_specific_assemblies": {"type": "array", "default": []},
"reference": {"type": "array", "default": []},
"keep_redundant": {"type": "array", "default": []},
"source_score":{
"type": "object",
"default": {},
Expand Down
15 changes: 13 additions & 2 deletions Mikado/preparation/annotation_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,11 @@ def run(self):
while True:
results = self.submission_queue.get()
try:
label, handle, strand_specific, is_reference, shelf_name = results
label, handle, strand_specific, is_reference, keep_redundant, shelf_name = results
except ValueError as exc:
raise ValueError("{}.\tValues: {}".format(exc, ", ".join([str(_) for _ in results])))
if handle == "EXIT":
self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT"))
self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT", "EXIT"))
break
counter += 1
self.logger.debug("Received %s (label: %s; SS: %s, shelf_name: %s)",
Expand All @@ -98,6 +98,7 @@ def run(self):
max_intron=self.max_intron,
strip_cds=self.__strip_cds,
is_reference=is_reference,
keep_redundant=keep_redundant,
strand_specific=strand_specific)
elif gff_handle.__annot_type__ == "gtf":
new_ids = load_from_gtf(shelf_name,
Expand All @@ -109,6 +110,7 @@ def run(self):
max_intron=self.max_intron,
is_reference=is_reference,
strip_cds=self.__strip_cds,
keep_redundant=keep_redundant,
strand_specific=strand_specific)
elif gff_handle.__annot_type__ == "bed12":
new_ids = load_from_bed12(shelf_name,
Expand All @@ -120,6 +122,7 @@ def run(self):
min_length=self.min_length,
max_intron=self.max_intron,
strip_cds=self.__strip_cds,
keep_redundant=keep_redundant,
strand_specific=strand_specific)
else:
raise ValueError("Invalid file type: {}".format(gff_handle.name))
Expand Down Expand Up @@ -392,6 +395,7 @@ def load_from_gff(shelf_name,
min_length=0,
max_intron=3*10**5,
is_reference=False,
keep_redundant=False,
strip_cds=False,
strand_specific=False):
"""
Expand Down Expand Up @@ -469,6 +473,7 @@ def load_from_gff(shelf_name,

exon_lines[row.id]["strand_specific"] = strand_specific
exon_lines[row.id]["is_reference"] = is_reference
exon_lines[row.id]["keep_redundant"] = keep_redundant
continue
elif row.is_exon is True:
if not row.is_cds or (row.is_cds is True and strip_cds is False):
Expand Down Expand Up @@ -513,6 +518,7 @@ def load_from_gff(shelf_name,
exon_lines[tid]["parent"] = transcript2genes[tid]
exon_lines[tid]["strand_specific"] = strand_specific
exon_lines[tid]["is_reference"] = is_reference
exon_lines[tid]["keep_redundant"] = keep_redundant
elif tid not in exon_lines and tid not in transcript2genes:
continue
else:
Expand Down Expand Up @@ -547,6 +553,7 @@ def load_from_gtf(shelf_name,
min_length=0,
max_intron=3*10**5,
is_reference=False,
keep_redundant=False,
strip_cds=False,
strand_specific=False):
"""
Expand Down Expand Up @@ -609,6 +616,7 @@ def load_from_gtf(shelf_name,
exon_lines[row.transcript]["parent"] = "{}.gene".format(row.id)
exon_lines[row.transcript]["strand_specific"] = strand_specific
exon_lines[row.transcript]["is_reference"] = is_reference
exon_lines[row.transcript]["keep_redundant"] = keep_redundant
if "exon_number" in exon_lines[row.transcript]["attributes"]:
del exon_lines[row.transcript]["attributes"]["exon_number"]
continue
Expand All @@ -635,6 +643,7 @@ def load_from_gtf(shelf_name,
exon_lines[row.transcript]["parent"] = "{}.gene".format(row.transcript)
exon_lines[row.transcript]["strand_specific"] = strand_specific
exon_lines[row.transcript]["is_reference"] = is_reference
exon_lines[row.transcript]["keep_redundant"] = keep_redundant
else:
if row.transcript in to_ignore:
continue
Expand Down Expand Up @@ -666,6 +675,7 @@ def load_from_bed12(shelf_name,
min_length=0,
max_intron=3*10**5,
is_reference=False,
keep_redundant=False,
strip_cds=False,
strand_specific=False):
"""
Expand Down Expand Up @@ -727,6 +737,7 @@ def load_from_bed12(shelf_name,
exon_lines[transcript.id]["parent"] = "{}.gene".format(transcript.id)
exon_lines[transcript.id]["strand_specific"] = strand_specific
exon_lines[transcript.id]["is_reference"] = is_reference
exon_lines[transcript.id]["keep_redundant"] = keep_redundant
exon_lines[transcript.id]["features"]["exon"] = [
(exon[0], exon[1]) for exon in transcript.exons
]
Expand Down

0 comments on commit d0a1ddb

Please sign in to comment.