Skip to content

Commit

Permalink
This commit should fix all the points regarding prepare for EI-CoreBioinformatics#141…
Browse files Browse the repository at this point in the history
…and tests them properly (EI-CoreBioinformatics#137). We should still create tests for the mikado configure step, and update the documentation (EI-CoreBioinformatics#136).
  • Loading branch information
lucventurini committed Nov 6, 2018
1 parent 7caa516 commit a15a3fb
Show file tree
Hide file tree
Showing 11 changed files with 355 additions and 129 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Bugfixes and improvements:
- [#134](https://github.com/lucventurini/mikado/issues/134): when checking for potential Alternative Splicing Events (ASEs), now Mikado will check whether the CDS phases are in frame with each other. Moreover
**Mikado will now calculate the CDS overlap percentage based on the primary transcript CDS length**, not the minimum CDS length between primary and candidate. Please note that the change **regarding the frame** also affects the monosublocus stage. Mikado still considers only the primary ORFs for the overlap.
- [#138](https://github.com/lucventurini/mikado/issues/138): Solved a bug which led Mikado to recalculate the phases for each model during picking, potentially creating mistakes for models truncated at the 5' end.
- [#142](https://github.com/lucventurini/mikado/issues/142): padded transcripts will add terminal exons so that
- [#142](https://github.com/lucventurini/mikado/issues/142): padded transcripts will add terminal *exons* rather than just extending their terminal ends. This should prevent the creation of faux retained introns. Please see the issue for more details.

# Version 1.2.4

Expand Down
5 changes: 4 additions & 1 deletion Mikado/configuration/configuration_blueprint.json
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,9 @@
"- out_fasta: output transcript FASTA file",
"- gff: array of input predictions for this step.",
"- log: output log. Default: prepare.log",
"- labels: labels to be associated with the input GFFs. Default: None."
"- labels: labels to be associated with the input GFFs. Default: None.",
"- reference: these files are treated as reference-like, i.e., these transcripts will never get discarded",
" during the preparation step."
],
"type": "object",
"required": ["out", "out_fasta", "gff"],
Expand All @@ -252,6 +254,7 @@
"gff": { "type": "array", "default": []},
"labels": {"type": "array", "default": []},
"strand_specific_assemblies": {"type": "array", "default": []},
"reference": {"type": "array", "default": []},
"source_score":{
"type": "object",
"default": {},
Expand Down
2 changes: 1 addition & 1 deletion Mikado/loci/locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,7 @@ def expand_transcript(transcript, new_starting_exons, new_final_exons, fai, logg
transcript.start, transcript.end, len(genome_seq))
logger.error(error)
raise InvalidTranscript(error)
seq = TranscriptChecker(transcript, genome_seq).cdna
seq = TranscriptChecker(transcript, genome_seq, is_reference=True).cdna
assert len(seq) == transcript.cdna_length, (len(seq), transcript.cdna_length, transcript.exons)
assert len(seq) == backup.cdna_length + upstream + downstream, (
len(seq), backup.cdna_length + upstream + downstream,
Expand Down
54 changes: 45 additions & 9 deletions Mikado/preparation/annotation_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ def run(self):
while True:
results = self.submission_queue.get()
try:
label, handle, strand_specific, shelf_name = results
label, handle, strand_specific, is_reference, shelf_name = results
except ValueError as exc:
raise ValueError("{}.\tValues: {}".format(exc, ", ".join([str(_) for _ in results])))
if handle == "EXIT":
self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT"))
self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT"))
break
counter += 1
self.logger.debug("Received %s (label: %s; SS: %s, shelf_name: %s)",
Expand All @@ -87,6 +87,7 @@ def run(self):
self.logger,
min_length=self.min_length,
strip_cds=self.__strip_cds,
is_reference=is_reference,
strand_specific=strand_specific)
elif gff_handle.__annot_type__ == "gtf":
new_ids = load_from_gtf(shelf_name,
Expand All @@ -95,6 +96,7 @@ def run(self):
found_ids,
self.logger,
min_length=self.min_length,
is_reference=is_reference,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
elif gff_handle.__annot_type__ == "bed12":
Expand All @@ -103,6 +105,7 @@ def run(self):
label,
found_ids,
self.logger,
is_reference=is_reference,
min_length=self.min_length,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
Expand Down Expand Up @@ -182,6 +185,7 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
"CREATE TABLE dump (chrom text, start integer, end integer, strand text, tid text, features blob)")

cursor.execute("CREATE INDEX idx ON dump (tid)")
logger.debug("Created tables for shelf %s", shelf_name)

for tid in exon_lines:
if "features" not in exon_lines[tid]:
Expand Down Expand Up @@ -248,9 +252,13 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True

# Discard transcript under a certain size
if tlength < min_length:
logger.info("Discarding %s because its size (%d) is under the minimum of %d",
tid, tlength, min_length)
continue
if exon_lines[tid]["is_reference"] is True:
logger.info("%s retained even if it is too short (%d) as it is a reference transcript.",
tid)
else:
logger.info("Discarding %s because its size (%d) is under the minimum of %d",
tid, tlength, min_length)
continue

values = json.dumps(exon_lines[tid])

Expand All @@ -274,6 +282,7 @@ def load_from_gff(shelf_name,
found_ids,
logger,
min_length=0,
is_reference=False,
strip_cds=False,
strand_specific=False):
"""
Expand All @@ -292,11 +301,16 @@ def load_from_gff(shelf_name,
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
:type strand_specific: bool
:param is_reference: boolean. If set to True, the transcript will always be retained.
:type is_reference: bool
:return:
"""

exon_lines = dict()

strip_cds = strip_cds and (not is_reference)
strand_specific = strand_specific or is_reference

transcript2genes = dict()
new_ids = set()

Expand Down Expand Up @@ -342,6 +356,7 @@ def load_from_gff(shelf_name,
exon_lines[row.id]["features"][row.feature].append((row.start, row.end, row.phase))

exon_lines[row.id]["strand_specific"] = strand_specific
exon_lines[row.id]["is_reference"] = is_reference
continue
elif row.is_exon is True:
if not row.is_cds or (row.is_cds is True and strip_cds is False):
Expand All @@ -366,7 +381,6 @@ def load_from_gff(shelf_name,
row.transcript = ["{0}_{1}".format(label, tid) for tid in row.transcript]

parents = row.transcript[:]
to_delete = set()
for tid in parents:

if tid in found_ids:
Expand All @@ -386,6 +400,7 @@ def load_from_gff(shelf_name,
exon_lines[tid]["tid"] = tid
exon_lines[tid]["parent"] = transcript2genes[tid]
exon_lines[tid]["strand_specific"] = strand_specific
exon_lines[tid]["is_reference"] = is_reference
elif tid not in exon_lines and tid not in transcript2genes:
continue
else:
Expand All @@ -405,7 +420,9 @@ def load_from_gff(shelf_name,
continue
gff_handle.close()

load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds)
logger.info("Starting to load %s", shelf_name)
load_into_storage(shelf_name, exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)

return new_ids

Expand All @@ -416,6 +433,7 @@ def load_from_gtf(shelf_name,
found_ids,
logger,
min_length=0,
is_reference=False,
strip_cds=False,
strand_specific=False):
"""
Expand All @@ -434,11 +452,16 @@ def load_from_gtf(shelf_name,
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
:type strand_specific: bool
:param is_reference: boolean. If set to True, the transcript will always be retained.
:type is_reference: bool
:return:
"""

exon_lines = dict()

strip_cds = strip_cds and (not is_reference)
strand_specific = strand_specific or is_reference

# Reduce memory footprint
[intern(_) for _ in ["chrom", "features", "strand", "attributes", "tid", "parent", "attributes"]]

Expand Down Expand Up @@ -470,6 +493,7 @@ def load_from_gtf(shelf_name,
exon_lines[row.transcript]["tid"] = row.id
exon_lines[row.transcript]["parent"] = "{}.gene".format(row.id)
exon_lines[row.transcript]["strand_specific"] = strand_specific
exon_lines[row.transcript]["is_reference"] = is_reference
if "exon_number" in exon_lines[row.transcript]["attributes"]:
del exon_lines[row.id]["attributes"]["exon_number"]
continue
Expand All @@ -494,6 +518,7 @@ def load_from_gtf(shelf_name,
exon_lines[row.id]["tid"] = row.transcript
exon_lines[row.id]["parent"] = "{}.gene".format(row.id)
exon_lines[row.id]["strand_specific"] = strand_specific
exon_lines[row.id]["is_reference"] = is_reference
else:
if row.transcript in to_ignore:
continue
Expand All @@ -509,7 +534,10 @@ def load_from_gtf(shelf_name,
exon_lines[row.transcript]["features"][row.feature].append((row.start, row.end, row.phase))
new_ids.add(row.transcript)
gff_handle.close()
load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds)
logger.info("Starting to load %s", shelf_name)
load_into_storage(shelf_name,
exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)

return new_ids

Expand All @@ -520,6 +548,7 @@ def load_from_bed12(shelf_name,
found_ids,
logger,
min_length=0,
is_reference=False,
strip_cds=False,
strand_specific=False):
"""
Expand All @@ -538,11 +567,16 @@ def load_from_bed12(shelf_name,
:type strip_cds: bool
:param strand_specific: whether the assembly is strand-specific or not.
:type strand_specific: bool
:param is_reference: boolean. If set to True, the transcript will always be retained.
:type is_reference: bool
:return:
"""

exon_lines = dict()

strip_cds = strip_cds and (not is_reference)
strand_specific = strand_specific or is_reference

# Reduce memory footprint
[intern(_) for _ in ["chrom", "features", "strand", "attributes", "tid", "parent", "attributes"]]

Expand Down Expand Up @@ -573,6 +607,7 @@ def load_from_bed12(shelf_name,
exon_lines[transcript.id]["tid"] = transcript.id
exon_lines[transcript.id]["parent"] = "{}.gene".format(transcript.id)
exon_lines[transcript.id]["strand_specific"] = strand_specific
exon_lines[transcript.id]["is_reference"] = is_reference
exon_lines[transcript.id]["features"]["exon"] = [
(exon[0], exon[1]) for exon in transcript.exons
]
Expand All @@ -585,6 +620,7 @@ def load_from_bed12(shelf_name,
]
new_ids.add(transcript.id)
gff_handle.close()
load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds)
load_into_storage(shelf_name, exon_lines,
logger=logger, min_length=min_length, strip_cds=strip_cds)

return new_ids
13 changes: 11 additions & 2 deletions Mikado/preparation/checking.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def create_transcript(lines,
start,
end,
lenient=False,
is_reference=False,
strand_specific=False,
canonical_splices=(("GT", "AG"),
("GC", "AG"),
Expand Down Expand Up @@ -51,6 +52,9 @@ def create_transcript(lines,
:param logger: optional logger to use during processing.
:param is_reference: boolean. If set, the transcript's strand will not be checked.
:rtype: (None|TranscriptChecker)
"""

Expand Down Expand Up @@ -94,6 +98,7 @@ def create_transcript(lines,
strand_specific=strand_specific,
canonical_splices=canonical_splices,
force_keep_cds=force_keep_cds,
is_reference=is_reference,
logger=logger)
logger.debug("Finished adding exon lines to %s", lines["tid"])
transcript_object.finalize()
Expand Down Expand Up @@ -128,7 +133,6 @@ def __init__(self,
gtf_out,
tmpdir,
lenient=False,
# strand_specific=False,
canonical_splices=(("GT", "AG"),
("GC", "AG"),
("AT", "AC")),
Expand Down Expand Up @@ -167,7 +171,7 @@ def __init__(self,
def run(self):

checker = functools.partial(create_transcript,
lenient=self.lenient,
# lenient=self.lenient,
# strand_specific=self.strand_specific,
canonical_splices=self.canonical,
logger=self.logger)
Expand All @@ -190,10 +194,15 @@ def run(self):
counter))
break
self.logger.debug("Checking %s", lines["tid"])
if "is_reference" not in lines:
raise KeyError(lines)

transcript = checker(lines,
str(self.fasta[lines["chrom"]][start-1:end]),
start,
end,
lenient=self.lenient,
is_reference=lines["is_reference"],
strand_specific=lines["strand_specific"])

if transcript is None:
Expand Down
Loading

0 comments on commit a15a3fb

Please sign in to comment.