diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1da7b9f02..5d1227408 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,7 +35,7 @@ Bugfixes and improvements:
- [#134](https://github.com/lucventurini/mikado/issues/134): when checking for potential Alternative Splicing Events (ASEs), now Mikado will check whether the CDS phases are in frame with each other. Moreover **Mikado will now calculate the CDS overlap percentage based on the primary transcript CDS length**, not the minimum CDS length between primary and candidate. Please note that the change **regarding the frame** also affects the monosublocus stage. Mikado still considers only the primary ORFs for the overlap.
- [#138](https://github.com/lucventurini/mikado/issues/138): Solved a bug which led Mikado to recalculate the phases for each model during picking, potentially creating mistakes for models truncated at the 5' end.
-- [#142](https://github.com/lucventurini/mikado/issues/142): padded transcripts will add terminal exons so that
+- [#142](https://github.com/lucventurini/mikado/issues/142): padded transcripts will add terminal *exons* rather than just extending their terminal ends. This should prevent the creation of faux retained introns. Please see the issue for more details.

# Version 1.2.4
diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
index 6cd116717..80d9b8963 100644
--- a/Mikado/configuration/configuration_blueprint.json
+++ b/Mikado/configuration/configuration_blueprint.json
@@ -240,7 +240,9 @@
"- out_fasta: output transcript FASTA file",
"- gff: array of input predictions for this step.",
"- log: output log. Default: prepare.log",
- "- labels: labels to be associated with the input GFFs. Default: None."
+ "- labels: labels to be associated with the input GFFs. Default: None.",
+ "- reference: these files are treated as reference-like, i.e. these transcripts will never get discarded",
+ " during the preparation step."
],
"type": "object",
"required": ["out", "out_fasta", "gff"],
@@ -252,6 +254,7 @@
"gff": { "type": "array", "default": []},
"labels": {"type": "array", "default": []},
"strand_specific_assemblies": {"type": "array", "default": []},
+ "reference": {"type": "array", "default": []},
"source_score":{
"type": "object",
"default": {},
diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py
index 06db3a040..6dd280836 100644
--- a/Mikado/loci/locus.py
+++ b/Mikado/loci/locus.py
@@ -924,7 +924,7 @@ def expand_transcript(transcript, new_starting_exons, new_final_exons, fai, logg
transcript.start, transcript.end, len(genome_seq))
logger.error(error)
raise InvalidTranscript(error)
- seq = TranscriptChecker(transcript, genome_seq).cdna
+ seq = TranscriptChecker(transcript, genome_seq, is_reference=True).cdna
assert len(seq) == transcript.cdna_length, (len(seq), transcript.cdna_length, transcript.exons)
assert len(seq) == backup.cdna_length + upstream + downstream, (
len(seq), backup.cdna_length + upstream + downstream,
diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py
index fd6e2a86b..ac3dbca2b 100644
--- a/Mikado/preparation/annotation_parser.py
+++ b/Mikado/preparation/annotation_parser.py
@@ -65,11 +65,11 @@ def run(self):
while True:
results = self.submission_queue.get()
try:
- label, handle, strand_specific, shelf_name = results
+ label, handle, strand_specific, is_reference, shelf_name = results
except ValueError as exc:
raise ValueError("{}.\tValues: {}".format(exc, ", ".join([str(_) for _ in results])))
if handle == "EXIT":
- self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT"))
+ self.submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT"))
break
counter += 1
self.logger.debug("Received %s (label: %s; SS: %s, shelf_name: %s)",
@@ -87,6 +87,7 @@ def run(self):
self.logger,
min_length=self.min_length,
strip_cds=self.__strip_cds,
+ is_reference=is_reference,
strand_specific=strand_specific)
elif gff_handle.__annot_type__ == "gtf":
new_ids = load_from_gtf(shelf_name,
@@ -95,6 +96,7 @@ def run(self):
found_ids,
self.logger,
min_length=self.min_length,
+ is_reference=is_reference,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
elif gff_handle.__annot_type__ == "bed12":
@@ -103,6 +105,7 @@ def run(self):
label,
found_ids,
self.logger,
+ is_reference=is_reference,
min_length=self.min_length,
strip_cds=self.__strip_cds,
strand_specific=strand_specific)
@@ -182,6 +185,7 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
"CREATE TABLE dump (chrom text, start integer, end integer, strand text, tid text, features blob)")
cursor.execute("CREATE INDEX idx ON dump (tid)")
+ logger.debug("Created tables for shelf %s", shelf_name)

for tid in exon_lines:
if "features" not in exon_lines[tid]:
@@ -248,9 +252,13 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True
# Discard transcript under a certain size
if tlength < min_length:
- logger.info("Discarding %s because its size (%d) is under the minimum of %d",
- tid, tlength, min_length)
- continue
+ if exon_lines[tid]["is_reference"] is True:
+ logger.info("%s retained even if it is too short (%d) as it is a reference transcript.",
+ tid, tlength)
+ else:
+ logger.info("Discarding %s because its size (%d) is under the minimum of %d",
+ tid, tlength, min_length)
+ continue

values = json.dumps(exon_lines[tid])
@@ -274,6 +282,7 @@ def load_from_gff(shelf_name,
found_ids,
logger,
min_length=0,
+ is_reference=False,
strip_cds=False,
strand_specific=False): """ @@ -292,11 +301,16 @@ def load_from_gff(shelf_name, :type strip_cds: bool :param strand_specific: whether the assembly is strand-specific or not. :type strand_specific: bool + :param is_reference: boolean. If set to True, the transcript will always be retained. + :type is_reference: bool :return: """ exon_lines = dict() + strip_cds = strip_cds and (not is_reference) + strand_specific = strand_specific or is_reference + transcript2genes = dict() new_ids = set() @@ -342,6 +356,7 @@ def load_from_gff(shelf_name, exon_lines[row.id]["features"][row.feature].append((row.start, row.end, row.phase)) exon_lines[row.id]["strand_specific"] = strand_specific + exon_lines[row.id]["is_reference"] = is_reference continue elif row.is_exon is True: if not row.is_cds or (row.is_cds is True and strip_cds is False): @@ -366,7 +381,6 @@ def load_from_gff(shelf_name, row.transcript = ["{0}_{1}".format(label, tid) for tid in row.transcript] parents = row.transcript[:] - to_delete = set() for tid in parents: if tid in found_ids: @@ -386,6 +400,7 @@ def load_from_gff(shelf_name, exon_lines[tid]["tid"] = tid exon_lines[tid]["parent"] = transcript2genes[tid] exon_lines[tid]["strand_specific"] = strand_specific + exon_lines[tid]["is_reference"] = is_reference elif tid not in exon_lines and tid not in transcript2genes: continue else: @@ -405,7 +420,9 @@ def load_from_gff(shelf_name, continue gff_handle.close() - load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds) + logger.info("Starting to load %s", shelf_name) + load_into_storage(shelf_name, exon_lines, + logger=logger, min_length=min_length, strip_cds=strip_cds) return new_ids @@ -416,6 +433,7 @@ def load_from_gtf(shelf_name, found_ids, logger, min_length=0, + is_reference=False, strip_cds=False, strand_specific=False): """ @@ -434,11 +452,16 @@ def load_from_gtf(shelf_name, :type strip_cds: bool :param strand_specific: whether the assembly is strand-specific or not. :type strand_specific: bool + :param is_reference: boolean. If set to True, the transcript will always be retained. 
+ :type is_reference: bool :return: """ exon_lines = dict() + strip_cds = strip_cds and (not is_reference) + strand_specific = strand_specific or is_reference + # Reduce memory footprint [intern(_) for _ in ["chrom", "features", "strand", "attributes", "tid", "parent", "attributes"]] @@ -470,6 +493,7 @@ def load_from_gtf(shelf_name, exon_lines[row.transcript]["tid"] = row.id exon_lines[row.transcript]["parent"] = "{}.gene".format(row.id) exon_lines[row.transcript]["strand_specific"] = strand_specific + exon_lines[row.transcript]["is_reference"] = is_reference if "exon_number" in exon_lines[row.transcript]["attributes"]: del exon_lines[row.id]["attributes"]["exon_number"] continue @@ -494,6 +518,7 @@ def load_from_gtf(shelf_name, exon_lines[row.id]["tid"] = row.transcript exon_lines[row.id]["parent"] = "{}.gene".format(row.id) exon_lines[row.id]["strand_specific"] = strand_specific + exon_lines[row.id]["is_reference"] = is_reference else: if row.transcript in to_ignore: continue @@ -509,7 +534,10 @@ def load_from_gtf(shelf_name, exon_lines[row.transcript]["features"][row.feature].append((row.start, row.end, row.phase)) new_ids.add(row.transcript) gff_handle.close() - load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds) + logger.info("Starting to load %s", shelf_name) + load_into_storage(shelf_name, + exon_lines, + logger=logger, min_length=min_length, strip_cds=strip_cds) return new_ids @@ -520,6 +548,7 @@ def load_from_bed12(shelf_name, found_ids, logger, min_length=0, + is_reference=False, strip_cds=False, strand_specific=False): """ @@ -538,11 +567,16 @@ def load_from_bed12(shelf_name, :type strip_cds: bool :param strand_specific: whether the assembly is strand-specific or not. :type strand_specific: bool + :param is_reference: boolean. If set to True, the transcript will always be retained. + :type is_reference: bool :return: """ exon_lines = dict() + strip_cds = strip_cds and (not is_reference) + strand_specific = strand_specific or is_reference + # Reduce memory footprint [intern(_) for _ in ["chrom", "features", "strand", "attributes", "tid", "parent", "attributes"]] @@ -573,6 +607,7 @@ def load_from_bed12(shelf_name, exon_lines[transcript.id]["tid"] = transcript.id exon_lines[transcript.id]["parent"] = "{}.gene".format(transcript.id) exon_lines[transcript.id]["strand_specific"] = strand_specific + exon_lines[transcript.id]["is_reference"] = is_reference exon_lines[transcript.id]["features"]["exon"] = [ (exon[0], exon[1]) for exon in transcript.exons ] @@ -585,6 +620,7 @@ def load_from_bed12(shelf_name, ] new_ids.add(transcript.id) gff_handle.close() - load_into_storage(shelf_name, exon_lines, logger=logger, min_length=min_length, strip_cds=strip_cds) + load_into_storage(shelf_name, exon_lines, + logger=logger, min_length=min_length, strip_cds=strip_cds) return new_ids \ No newline at end of file diff --git a/Mikado/preparation/checking.py b/Mikado/preparation/checking.py index d675b4824..8023c89bf 100644 --- a/Mikado/preparation/checking.py +++ b/Mikado/preparation/checking.py @@ -21,6 +21,7 @@ def create_transcript(lines, start, end, lenient=False, + is_reference=False, strand_specific=False, canonical_splices=(("GT", "AG"), ("GC", "AG"), @@ -51,6 +52,9 @@ def create_transcript(lines, :param logger: optional logger to use during processing. + :param is_reference: boolean. If set, the transcript's strand will not be checked. 
+ + :rtype: (None|TranscriptChecker) """ @@ -94,6 +98,7 @@ def create_transcript(lines, strand_specific=strand_specific, canonical_splices=canonical_splices, force_keep_cds=force_keep_cds, + is_reference=is_reference, logger=logger) logger.debug("Finished adding exon lines to %s", lines["tid"]) transcript_object.finalize() @@ -128,7 +133,6 @@ def __init__(self, gtf_out, tmpdir, lenient=False, - # strand_specific=False, canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")), @@ -167,7 +171,7 @@ def __init__(self, def run(self): checker = functools.partial(create_transcript, - lenient=self.lenient, + # lenient=self.lenient, # strand_specific=self.strand_specific, canonical_splices=self.canonical, logger=self.logger) @@ -190,10 +194,15 @@ def run(self): counter)) break self.logger.debug("Checking %s", lines["tid"]) + if "is_reference" not in lines: + raise KeyError(lines) + transcript = checker(lines, str(self.fasta[lines["chrom"]][start-1:end]), start, end, + lenient=self.lenient, + is_reference=lines["is_reference"], strand_specific=lines["strand_specific"]) if transcript is None: diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py index e34bc8f9d..b270f9880 100644 --- a/Mikado/preparation/prepare.py +++ b/Mikado/preparation/prepare.py @@ -2,7 +2,7 @@ import tempfile import gc from .checking import create_transcript, CheckingProcess -from .annotation_parser import AnnotationParser, load_from_gtf, load_from_gff +from .annotation_parser import AnnotationParser, load_from_gtf, load_from_gff, load_from_bed12 from ..parsers import to_gff import operator import collections @@ -62,11 +62,15 @@ def store_transcripts(shelf_stacks, logger, keep_redundant=False): transcripts = collections.defaultdict(dict) for shelf_name in shelf_stacks: shelf_score = shelf_stacks[shelf_name]["score"] - for values in shelf_stacks[shelf_name]["cursor"].execute("SELECT chrom, start, end, strand, tid FROM dump"): - chrom, start, end, strand, tid = values - if (start, end) not in transcripts[chrom]: - transcripts[chrom][(start, end)] = list() - transcripts[chrom][(start, end)].append((tid, shelf_name, shelf_score)) + is_reference = shelf_stacks[shelf_name]["is_reference"] + try: + for values in shelf_stacks[shelf_name]["cursor"].execute("SELECT chrom, start, end, strand, tid FROM dump"): + chrom, start, end, strand, tid = values + if (start, end) not in transcripts[chrom]: + transcripts[chrom][(start, end)] = list() + transcripts[chrom][(start, end)].append((tid, shelf_name, shelf_score, is_reference)) + except sqlite3.OperationalError: + raise sqlite3.OperationalError("dump not found in {}".format(shelf_name)) for chrom in sorted(transcripts.keys()): @@ -80,14 +84,14 @@ def store_transcripts(shelf_stacks, logger, keep_redundant=False): if len(tids) > 1: exons = collections.defaultdict(list) di_features = dict() - for tid, shelf, score in tids: + for tid, shelf, score, is_reference in tids: strand, features = next(shelf_stacks[shelf]["cursor"].execute( "select strand, features from dump where tid = ?", (tid,))) features = json.loads(features) exon_set = tuple(sorted([(exon[0], exon[1], strand) for exon in features["features"]["exon"]], key=operator.itemgetter(0, 1))) - exons[exon_set].append((tid, shelf, score)) + exons[exon_set].append((tid, shelf, score, is_reference)) di_features[tid] = features tids = [] logger.debug("%d exon chains for pos %s", @@ -95,26 +99,29 @@ def store_transcripts(shelf_stacks, logger, keep_redundant=False): for tid_list in exons.values(): if len(tid_list) > 1 and 
keep_redundant is False: cds = collections.defaultdict(list) - for tid, shelf, score in tid_list: + for tid, shelf, score, is_reference in tid_list: cds_set = tuple(sorted([(exon[0], exon[1]) for exon in di_features[tid]["features"].get("CDS", [])])) - cds[cds_set].append((tid, shelf, score)) + cds[cds_set].append((tid, shelf, score, is_reference)) # Now checking the CDS for cds_list in cds.values(): if len(cds_list) > 1: logger.debug("The following transcripts are redundant: %s", ",".join([_[0] for _ in cds_list])) # TODO: we have to select things so that we prioritise transcripts correctly - max_score = max([_[2] for _ in cds_list]) - cds_list = [_ for _ in cds_list if _[2] == max_score] - to_keep = random.choice(cds_list) + # First step: select things that are reference + to_keep = [_ for _ in cds_list if _[3] is True] + if len(to_keep) == 0: + max_score = max([_[2] for _ in cds_list]) + cds_list = [_ for _ in cds_list if _[2] == max_score] + to_keep = [random.choice(cds_list)] + for tid in cds_list: - if tid != to_keep: + if tid not in to_keep: logger.info("Discarding %s as redundant", tid[0]) else: logger.info("Keeping %s amongst redundant transcripts", tid[0]) - - tids.append(to_keep) + tids.extend(to_keep) else: tids.extend(tid_list) @@ -151,8 +158,6 @@ def perform_check(keys, shelve_stacks, args, logger): # with all necessary arguments aside for the lines partial_checker = functools.partial( create_transcript, - lenient=args.json_conf["prepare"]["lenient"], - # strand_specific=args.json_conf["prepare"]["strand_specific"], canonical_splices=args.json_conf["prepare"]["canonical"], logger=logger, force_keep_cds= not args.json_conf["prepare"]["strip_cds"]) @@ -169,6 +174,8 @@ def perform_check(keys, shelve_stacks, args, logger): tobj, str(args.json_conf["reference"]["genome"][chrom][key[0]-1:key[1]]), key[0], key[1], + lenient=args.json_conf["prepare"]["lenient"], + is_reference=tobj["is_reference"], strand_specific=tobj["strand_specific"]) if transcript_object is None: continue @@ -195,7 +202,6 @@ def perform_check(keys, shelve_stacks, args, logger): os.path.basename(args.json_conf["prepare"]["files"]["out"].name), args.tempdir.name, lenient=args.json_conf["prepare"]["lenient"], - # strand_specific=args.json_conf["prepare"]["strand_specific"], canonical_splices=args.json_conf["prepare"]["canonical"], log_level=args.level) for _ in range(args.json_conf["prepare"]["procs"])] @@ -234,6 +240,111 @@ def perform_check(keys, shelve_stacks, args, logger): return +def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip_cds): + + logger.info("Starting to load lines from %d files (single-threaded)", + len(args.json_conf["prepare"]["files"]["gff"])) + previous_file_ids = collections.defaultdict(set) + to_do = list(zip( + shelve_names, + args.json_conf["prepare"]["files"]["labels"], + args.json_conf["prepare"]["files"]["strand_specific_assemblies"], + args.json_conf["prepare"]["files"]["reference"], + args.json_conf["prepare"]["files"]["gff"])) + logger.info("To do: %d combinations", len(to_do)) + + for new_shelf, label, strand_specific, is_reference, gff_name in to_do: + logger.info("Starting with %s", gff_name) + gff_handle = to_gff(gff_name) + found_ids = set.union(set(), *previous_file_ids.values()) + if gff_handle.__annot_type__ == "gff3": + new_ids = load_from_gff(new_shelf, + gff_handle, + label, + found_ids, + logger, + min_length=min_length, + strip_cds=strip_cds and not is_reference, + is_reference=is_reference, + strand_specific=strand_specific or is_reference) 
+ elif gff_handle.__annot_type__ == "bed12": + new_ids = load_from_bed12(new_shelf, + gff_handle, + label, + found_ids, + logger, + min_length=min_length, + strip_cds=strip_cds and not is_reference, + is_reference=is_reference, + strand_specific=strand_specific or is_reference) + else: + new_ids = load_from_gtf(new_shelf, + gff_handle, + label, + found_ids, + logger, + min_length=min_length, + strip_cds=strip_cds and not is_reference, + is_reference=is_reference, + strand_specific=strand_specific or is_reference) + + previous_file_ids[gff_handle.name] = new_ids + return + + +def _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads): + logger.info("Starting to load lines from %d files (using %d processes)", + len(args.json_conf["prepare"]["files"]["gff"]), threads) + submission_queue = multiprocessing.JoinableQueue(-1) + + working_processes = [] + # working_processes = [ for _ in range(threads)] + + for num in range(threads): + proc = AnnotationParser(submission_queue, + args.logging_queue, + num + 1, + log_level=args.level, + min_length=min_length, + strip_cds=strip_cds) + proc.start() + working_processes.append(proc) + + # [_.start() for _ in working_processes] + for new_shelf, label, strand_specific, is_reference, gff_name in zip( + shelve_names, + args.json_conf["prepare"]["files"]["labels"], + args.json_conf["prepare"]["files"]["strand_specific_assemblies"], + args.json_conf["prepare"]["files"]["reference"], + args.json_conf["prepare"]["files"]["gff"]): + submission_queue.put((label, gff_name, strand_specific, is_reference, new_shelf)) + + submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT")) + + [_.join() for _ in working_processes] + + tid_counter = Counter() + for shelf in shelve_names: + conn = sqlite3.connect(shelf) + cursor = conn.cursor() + tid_counter.update([_[0] for _ in cursor.execute("SELECT tid FROM dump")]) + if tid_counter.most_common()[0][1] > 1: + if set(args.json_conf["prepare"]["files"]["labels"]) == {""}: + exception = exceptions.RedundantNames( + """Found redundant names during multiprocessed file analysis. + Please repeat using distinct labels for your input files. Aborting.""") + else: + exception = exceptions.RedundantNames( + """Found redundant names during multiprocessed file analysis, even if + unique labels were provided. Please try to repeat with a different and + more unique set of labels. 
Aborting.""") + logger.exception(exception) + raise exception + + del working_processes + gc.collect() + + def load_exon_lines(args, shelve_names, logger, min_length=0): """This function loads all exon lines from the GFF inputs into a @@ -255,89 +366,9 @@ def load_exon_lines(args, shelve_names, logger, min_length=0): strip_cds = args.json_conf["prepare"]["strip_cds"] if args.json_conf["prepare"]["single"] is True or threads == 1: - - logger.info("Starting to load lines from %d files (single-threaded)", - len(args.json_conf["prepare"]["files"]["gff"])) - previous_file_ids = collections.defaultdict(set) - for new_shelf, label, strand_specific, gff_name in zip( - shelve_names, - args.json_conf["prepare"]["files"]["labels"], - args.json_conf["prepare"]["files"]["strand_specific_assemblies"], - args.json_conf["prepare"]["files"]["gff"]): - logger.info("Starting with %s", gff_name) - gff_handle = to_gff(gff_name) - found_ids = set.union(set(), *previous_file_ids.values()) - if gff_handle.__annot_type__ == "gff3": - new_ids = load_from_gff(new_shelf, - gff_handle, - label, - found_ids, - logger, - min_length=min_length, - strip_cds=strip_cds, - strand_specific=strand_specific) - else: - new_ids = load_from_gtf(new_shelf, - gff_handle, - label, - found_ids, - logger, - min_length=min_length, - strip_cds=strip_cds, - strand_specific=strand_specific) - - previous_file_ids[gff_handle.name] = new_ids + _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip_cds) else: - logger.info("Starting to load lines from %d files (using %d processes)", - len(args.json_conf["prepare"]["files"]["gff"]), threads) - submission_queue = multiprocessing.JoinableQueue(-1) - - working_processes = [] - # working_processes = [ for _ in range(threads)] - - for num in range(threads): - proc = AnnotationParser(submission_queue, - args.logging_queue, - num + 1, - log_level=args.level, - min_length=min_length, - strip_cds=strip_cds) - proc.start() - working_processes.append(proc) - - # [_.start() for _ in working_processes] - for new_shelf, label, strand_specific, gff_name in zip( - shelve_names, - args.json_conf["prepare"]["files"]["labels"], - args.json_conf["prepare"]["files"]["strand_specific_assemblies"], - args.json_conf["prepare"]["files"]["gff"]): - - submission_queue.put((label, gff_name, strand_specific, new_shelf)) - - submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT")) - - [_.join() for _ in working_processes] - - tid_counter = Counter() - for shelf in shelve_names: - conn = sqlite3.connect(shelf) - cursor = conn.cursor() - tid_counter.update([_[0] for _ in cursor.execute("SELECT tid FROM dump")]) - if tid_counter.most_common()[0][1] > 1: - if set(args.json_conf["prepare"]["files"]["labels"]) == {""}: - exception = exceptions.RedundantNames( - """Found redundant names during multiprocessed file analysis. - Please repeat using distinct labels for your input files. Aborting.""") - else: - exception = exceptions.RedundantNames( - """Found redundant names during multiprocessed file analysis, even if - unique labels were provided. Please try to repeat with a different and - more unique set of labels. 
Aborting.""") - logger.exception(exception) - raise exception - - del working_processes - gc.collect() + _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads) logger.info("Finished loading lines from %d files", len(args.json_conf["prepare"]["files"]["gff"])) @@ -382,6 +413,11 @@ def prepare(args, logger): (member in args.json_conf["prepare"]["files"]["strand_specific_assemblies"]) for member in args.json_conf["prepare"]["files"]["gff"]] + args.json_conf["prepare"]["files"]["reference"] = [ + (member in args.json_conf["prepare"]["files"]["reference"]) + for member in args.json_conf["prepare"]["files"]["gff"] + ] + shelve_names = [path_join(args.json_conf["prepare"]["files"]["output_dir"], "mikado_shelf_{}.db".format(str(_).zfill(5))) for _ in range(len(args.json_conf["prepare"]["files"]["gff"]))] @@ -438,10 +474,14 @@ def prepare(args, logger): shelve_source_scores.append( args.json_conf["prepare"]["files"]["source_score"].get(label, 0) ) + try: - for shelf, score in zip(shelve_names, shelve_source_scores): + for shelf, score, is_reference in zip(shelve_names, shelve_source_scores, + args.json_conf["prepare"]["files"]["reference"]): + assert isinstance(is_reference, bool) conn = sqlite3.connect(shelf) - shelf_stacks[shelf] = {"conn": conn, "cursor": conn.cursor(), "score": score} + shelf_stacks[shelf] = {"conn": conn, "cursor": conn.cursor(), "score": score, + "is_reference": is_reference} # shelf_stacks = dict((_, shelve.open(_, flag="r")) for _ in shelve_names) except Exception as exc: raise TypeError((shelve_names, exc)) diff --git a/Mikado/subprograms/configure.py b/Mikado/subprograms/configure.py index c3d49c7a2..deb13ecc9 100644 --- a/Mikado/subprograms/configure.py +++ b/Mikado/subprograms/configure.py @@ -199,12 +199,12 @@ def create_config(args): elif args.list: config["prepare"]["files"]["source_score"] = dict() with open(args.list) as list_file: - files, labels, strandedness, scores = [], [], [], [] + files, labels, strandedness, scores, is_references = [], [], [], [], [] files_counter = Counter() for line in list_file: try: _fields = line.rstrip().split("\t") - filename, label, stranded = _fields[:3] + filename, label, stranded = _fields[:3] if not os.path.exists(filename): raise ValueError("Invalid file name: {}".format(filename)) @@ -218,6 +218,15 @@ def create_config(args): else: score = 0 scores.append(score) + if len(_fields) > 4: + if _fields[4] == "True": + is_reference = True + else: + is_reference = False + else: + is_reference = False + is_references.append(is_reference) + except ValueError as exc: raise ValueError("Malformed inputs file. 
Error:\n{}".format(exc)) files_counter.update(files) @@ -234,6 +243,9 @@ def create_config(args): for source, score in zip(labels, scores): config["prepare"]["files"]["source_score"][source] = score + config["prepare"]["files"]["reference"] = [list(labels)[_[0]] for _ in enumerate(is_references) + if _[1] is True] + elif args.no_files is True: for stage in ["pick", "prepare", "serialise"]: if "files" in config[stage]: diff --git a/Mikado/tests/prepare_misc_test.py b/Mikado/tests/prepare_misc_test.py index 6c494a9c8..4aa5c6700 100644 --- a/Mikado/tests/prepare_misc_test.py +++ b/Mikado/tests/prepare_misc_test.py @@ -218,6 +218,7 @@ def test_example_model_through_process(self): lines["features"] = dict() lines["features"]["exon"] = [(208937, 209593), (209881, 210445)] lines["strand_specific"] = True + lines["is_reference"] = False self.submission_queue.put((lines, lines["start"], lines["end"], 0)) self.submission_queue.put(("EXIT", None, None, None)) proc.start() diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 768250ded..40eead61b 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -24,6 +24,7 @@ from Mikado.transcripts.transcript import Namespace from Mikado.utilities.log_utils import create_null_logger, create_default_logger from Mikado.parsers.GFF import GffLine +from Mikado.parsers.GTF import GtfLine from Mikado.parsers import to_gff from Mikado.transcripts import Transcript @@ -143,8 +144,10 @@ def test_prepare_trinity_gff(self): prepare.prepare(args, self.logger) # Now that the program has run, let's check the output - fa = pyfaidx.Fasta(os.path.join(self.conf["prepare"]["files"]["output_dir"], - "mikado_prepared.fasta")) + fasta = os.path.join(self.conf["prepare"]["files"]["output_dir"], + "mikado_prepared.fasta") + self.assertGreater(os.stat(fasta).st_size, 0, test_file) + fa = pyfaidx.Fasta(fasta) res = dict((_, len(fa[_])) for _ in fa.keys()) fa.close() self.assertEqual(res, self.trinity_res) @@ -553,6 +556,93 @@ def test_source_selection(self): key = list(fa.keys())[0] self.assertEqual(key, res, round) + def test_reference_selection(self): + + self.conf["prepare"]["files"]["output_dir"] = outdir = tempfile.gettempdir() + self.conf["prepare"]["files"]["out_fasta"] = "mikado_prepared.fasta" + self.conf["prepare"]["files"]["out"] = "mikado_prepared.gtf" + self.conf["prepare"]["strip_cds"] = False + self.conf["prepare"]["keep_redundant"] = False + + self.conf["reference"]["genome"] = self.__genomefile__.name + + t = Transcript() + # This is *key*. Transcript T1 should never be selected, unless we are having a "lenient" analysis. + # However, by flagging T1 as a "reference" transcript, we should retain it + self.conf["prepare"]["lenient"] = False + t.chrom, t.start, t.end, t.strand = "Chr5", 208937, 210445, "-" + t.add_exons([(208937, 209593), (209881, 210445)]) + t.id = "file1.1" + t.parent = "file1" + t.finalize() + + # Likewise. We will also test that even when different scores are applied, we *still* retain all + # transcripts. + t2 = t.deepcopy() + t2.id = "file2.1" + t2.parent = "file2" + + rounds = { + # Standard cases. I expect the transcripts to be reversed, and one to be picked + 0: [0, 0, [], "rand", "+"], + 1: [-2, -2, [], "rand", "+"], + 2: [10, 10, [], "rand", "+"], + # T1 as reference. T1 should be kept, T1 should be on the negative strand. + 3: [0, 0, ["T1"], sorted(["T1_file1.1"]), "-"], + # Both as reference. Both should be kept, both should be on the negative strand. 
+ 4: [1, 0, ["T1", "T2"], sorted(["T1_file1.1", "T2_file2.1"]), "-"], + 5: [0, 0, ["T2"], sorted(["T2_file2.1"]), "-"] + + } + + for fformat in ("gff3", "gtf", "bed12"): + t_file = tempfile.NamedTemporaryFile(mode="wt", suffix=".{}".format(fformat)) + t2_file = tempfile.NamedTemporaryFile(mode="wt", suffix=".{}".format(fformat)) + print(t.format(fformat), file=t_file) + t_file.flush() + print(t2.format(fformat), file=t2_file) + t2_file.flush() + + self.conf["prepare"]["files"]["gff"] = [t_file.name, t2_file.name] + self.conf["prepare"]["files"]["strand_specific_assemblies"] = [t_file.name, t2_file.name] + self.conf["prepare"]["files"]["labels"] = ["T1", "T2"] + + for round in rounds: + for iteration in range(4): # Repeat each test 4 times + with self.subTest(round=round, format=format, iteration=iteration, + msg="Starting round {} ({})".format(round, rounds[round])): + t1_score, t2_score, is_ref, res, corr_strand = rounds[round] + self.conf["prepare"]["files"]["source_score"] = {"T1": t1_score, + "T2": t2_score} + self.conf["prepare"]["files"]["reference"] = [] + for label in is_ref: + if label == "T1": + self.conf["prepare"]["files"]["reference"].append(t_file.name) + elif label == "T2": + self.conf["prepare"]["files"]["reference"].append(t2_file.name) + + args = Namespace() + args.strip_cds = False + args.json_conf = self.conf + with self.assertLogs(self.logger, "INFO") as cm: + prepare.prepare(args, self.logger) + self.assertGreater(os.stat(self.conf["prepare"]["files"]["out_fasta"]).st_size, 0, + (round, fformat, iteration, "\n".join(cm.output))) + fa = pyfaidx.Fasta(os.path.join(outdir, os.path.basename( + self.conf["prepare"]["files"]["out_fasta"]))) + if res != "rand": + key = sorted(list(fa.keys())) + self.assertEqual(key, res, round) + else: + self.assertEqual(len(fa.keys()), 1, (round, fa.keys(), res)) + gtf = os.path.join(outdir, os.path.basename( + self.conf["prepare"]["files"]["out"])) + strand = [_.strand for _ in to_gff(gtf)] + self.assertEqual(len(set(strand)), 1, strand) + self.assertEqual(set(strand).pop(), corr_strand, + (round, self.conf["prepare"]["files"]["reference"], + set(strand), corr_strand)) + class CompareCheck(unittest.TestCase): diff --git a/Mikado/tests/test_transcript_checker.py b/Mikado/tests/test_transcript_checker.py index 15c8f41c1..a35ccd002 100644 --- a/Mikado/tests/test_transcript_checker.py +++ b/Mikado/tests/test_transcript_checker.py @@ -360,6 +360,20 @@ def test_sequence_reversed(self): self.assertEqual(model.strand, "-") self.assertEqual(fasta, TranscriptChecker.rev_complement(seq)) + def test_reference_not_flipped(self): + + model = Transcript() + model.chrom, model.start, model.end = "Chr5", 9930, 13235 + model.id, model.parent, model.strand = "AT5G01030.1", "AT5G01030", "-" + model.add_exons([(9930, 10172), (10620, 12665), (12797, 13235)]) + model.add_exons([(10638, 12665), (12797, 13003)], features="CDS") + model.finalize() + model_fasta = self.fasta["Chr5"][model.start - 1:model.end] + check_model = TranscriptChecker(model, model_fasta, is_reference=True) + check_model.check_strand() + self.assertEqual(check_model.strand, "-") + self.assertGreater(check_model.combined_cds_length, 0) + class StopCodonChecker(unittest.TestCase): diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py index 5a89881c8..c0ec5225c 100644 --- a/Mikado/transcripts/transcriptchecker.py +++ b/Mikado/transcripts/transcriptchecker.py @@ -33,6 +33,7 @@ class TranscriptChecker(Transcript): # pylint: disable=too-many-arguments def 
__init__(self, gffline, seq,
strand_specific=False, lenient=False,
+ is_reference=False,
canonical_splices=(("GT", "AG"),
("GC", "AG"),
("AT", "AC")),
force_keep_cds=False,
logger=None):
@@ -53,6 +54,7 @@ def __init__(self, gffline, seq,
:type lenient: bool
"""
self.__strand_specific = False
+ self.__is_reference = False
self.mixed_attribute = ""

if seq is None:
@@ -61,16 +63,19 @@ def __init__(self, gffline, seq,
self.__dict__.update(gffline.__dict__)
else:
super().__init__(gffline)
- self.original_strand = self.strand[:]
+ if self.strand is not None:
+ self.original_strand = self.strand[:]
+ else:
+ self.original_strand = None
self.__set_fasta_seq(seq)
- # self.fasta_seq = seq
- self.strand_specific = strand_specific
self.checked = False
- self.lenient = lenient
self.mixed_splices = False
self.reversed = False
self.canonical_splices = []
- self.__force_keep_cds = force_keep_cds
+ self.is_reference = is_reference
+ self.__force_keep_cds = force_keep_cds or self.is_reference
+ self.lenient = lenient or self.is_reference
+ self.strand_specific = strand_specific or self.is_reference

if not isinstance(canonical_splices, (tuple, list)):
raise ValueError("Canonical splices should be provided as lists or tuples")
@@ -80,6 +85,7 @@ def __init__(self, gffline, seq,
self.canonical_junctions = []
self.logger = logger
+
# pylint: enable=too-many-arguments

@property
@@ -110,6 +116,16 @@ def rev_complement(cls, string):
return Seq.reverse_complement(string)

+ @property
+ def is_reference(self):
+ return self.__is_reference
+
+ @is_reference.setter
+ def is_reference(self, value):
+ if not isinstance(value, bool):
+ raise TypeError("This property only accepts boolean values")
+ self.__is_reference = value
+
@property
def strand_specific(self):
"""
@@ -227,10 +243,15 @@ def check_strand(self):
canonical_counter["+"])

elif canonical_counter["-"] > 0:
- self.logger.debug("Transcript %s only has negative splice junctions, (%s), reversing.",
+ if self.is_reference is False:
+ self.logger.debug("Transcript %s only has negative splice junctions, (%s), reversing.",
self.id, canonical_counter)
- self.reverse_strand()
- self.reversed = True
+ self.reverse_strand()
+ self.reversed = True
+ else:
+ self.logger.debug(
+ "Transcript %s only has negative splice junctions, but as it is a reference we will not reverse it",
+ self.id)

if canonical_counter["+"] >= canonical_counter["-"] or (
max(canonical_counter["-"], canonical_counter["+"]) > 0 and self.strand_specific is True):
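
The net effect of the new is_reference flag during `mikado prepare` can be summarised as follows: reference inputs keep their CDS even when strip_cds is requested, they are always treated as strand-specific, they are retained even when shorter than the configured minimum length, and among redundant (identical) models the reference transcripts are always the ones kept. The sketch below condenses that logic for illustration; the names Candidate, resolve_flags, keep_despite_length and pick_redundant are invented for this example and are not part of the Mikado code base.

    # Illustrative sketch of the is_reference semantics introduced by this patch.
    import random
    from typing import List, NamedTuple

    class Candidate(NamedTuple):
        tid: str
        score: float
        is_reference: bool

    def resolve_flags(strip_cds, strand_specific, is_reference):
        # Mirrors load_from_gff/load_from_gtf/load_from_bed12: reference models
        # keep their CDS and are always handled as strand-specific.
        return strip_cds and not is_reference, strand_specific or is_reference

    def keep_despite_length(tlength, min_length, is_reference):
        # Mirrors load_into_storage: short reference models are retained.
        return tlength >= min_length or is_reference

    def pick_redundant(candidates: List[Candidate]) -> List[Candidate]:
        # Mirrors store_transcripts: among models with identical exons and CDS,
        # keep every reference transcript; otherwise keep one top-scoring model.
        kept = [cand for cand in candidates if cand.is_reference]
        if not kept:
            best = max(cand.score for cand in candidates)
            kept = [random.choice([cand for cand in candidates if cand.score == best])]
        return kept

    print(resolve_flags(strip_cds=True, strand_specific=False, is_reference=True))  # (False, True)
    print(keep_despite_length(150, 200, is_reference=True))                         # True
    print(pick_redundant([Candidate("T1_file1.1", 0, True), Candidate("T2_file2.1", 10, False)]))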
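
On the configuration side, the list file accepted by `mikado configure --list` gains an optional fifth column: the literal string True marks that input as reference-like (column order, per the updated configure.py: file, label, strand-specificity, optional score, optional reference flag). A minimal sketch of parsing one such row follows; the file name and label are made up, and the score handling is abridged with respect to the real code.

    # Illustrative only: parsing one row of the five-column list file.
    line = "reference_annotation.gff3\tref\tTrue\t5\tTrue"  # made-up example row

    _fields = line.rstrip().split("\t")
    filename, label, stranded = _fields[:3]
    score = float(_fields[3]) if len(_fields) > 3 else 0      # abridged score handling
    is_reference = len(_fields) > 4 and _fields[4] == "True"  # new fifth column

    print(filename, label, stranded, score, is_reference)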
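
Within TranscriptChecker the flag is deliberately coupled to the existing switches: is_reference=True implies force_keep_cds, lenient and strand_specific, which is why check_strand() now logs, but never reverses, a reference model whose splice junctions look canonical only on the opposite strand (see test_reference_not_flipped). A toy condensation of that constructor coupling, not the real class:

    # Toy condensation of the switch coupling added to TranscriptChecker.__init__.
    class CheckerFlags:
        def __init__(self, strand_specific=False, lenient=False,
                     force_keep_cds=False, is_reference=False):
            self.is_reference = is_reference
            self.force_keep_cds = force_keep_cds or is_reference
            self.lenient = lenient or is_reference
            self.strand_specific = strand_specific or is_reference

    flags = CheckerFlags(is_reference=True)
    assert (flags.force_keep_cds, flags.lenient, flags.strand_specific) == (True, True, True)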