This should fix the stats part for EI-CoreBioinformatics#226. Next, c…

…ompare
lucventurini · Oct 8, 2019 · 42f3de3 · 42f3de3
1 parent 21866a0
commit 42f3de3
Show file tree

Hide file tree

Showing 6 changed files with 135 additions and 136 deletions.
diff --git a/Mikado/loci/reference_gene.py b/Mikado/loci/reference_gene.py
@@ -11,6 +11,7 @@
 import operator
 from sys import intern
 from ..transcripts.transcript import Transcript
+from ..transcripts.transcriptcomputer import TranscriptComputer
 from ..exceptions import InvalidTranscript, InvalidCDS
 from ..parsers.GFF import GffLine
 from ..parsers.GTF import GtfLine
@@ -49,11 +50,6 @@ def __init__(self, transcr: [None, Transcript], gid=None, logger=None, only_codi
             elif isinstance(transcr, GffLine):
                 if transcr.is_gene is True:
                     self.__from_gene = True
-                elif (from_exon is False) and ("match" not in transcr.feature):
-                    raise AssertionError(str(transcr))
-                elif from_exon is True:
-                    self.__from_gene = False
-
                 self.id = transcr.id
                 self.attributes = transcr.attributes.copy()
                 self.feature = transcr.feature
@@ -170,7 +166,15 @@ def add_exon(self, row):
                         self.transcripts[tid].add_exon(row)
                     break
             if not found:
-                raise AssertionError("{}\n{}".format(parent, self.transcripts, row))
+                self.transcripts[parent] = TranscriptComputer(row,
+                                                              trust_orf=True,
+                                                              logger=self.logger,
+                                                              accept_undefined_multi=True,
+                                                              source=row.source,
+                                                              is_reference=True,
+                                                              )
+                self.transcripts[parent].parent = self.id
+                # raise AssertionError("{}\n{}".format(parent, self.transcripts, row))
 
     def __getitem__(self, tid: str) -> Transcript:
         return self.transcripts[tid]
@@ -260,7 +264,7 @@ def load_dict(self, state, exclude_utr=False, protein_coding=False, trust_orf=Fa
             setattr(self, key, state[key])
 
         for tid, tvalues in state["transcripts"].items():
-            transcript = Transcript(logger=self.logger)
+            transcript = TranscriptComputer(logger=self.logger)
             transcript.load_dict(tvalues, trust_orf=trust_orf)
             transcript.finalize()
             if protein_coding is True and transcript.is_coding is False:

diff --git a/Mikado/subprograms/util/stats.py b/Mikado/subprograms/util/stats.py
@@ -7,7 +7,7 @@
 import sys
 import argparse
 import csv
-from ...exceptions import InvalidCDS
+from ...transcripts.transcriptcomputer import TranscriptComputer
 from .. import to_gff
 from ...loci import Transcript, Gene
 from ...parsers.GFF import GFF3
@@ -92,84 +92,6 @@ def weighted_percentile(a, percentile=numpy.array([75, 25]), weights=None):
     return out_percentiles
 
 
-class TranscriptComputer(Transcript):
-    """
-    Class that is used to calculate and store basic statistics about a transcript object.
-    """
-
-    data_fields = ["parent", 'chrom',
-                   'start', 'end',
-                   'introns', 'exons',
-                   'exon_lengths', 'intron_lengths',
-                   'cdna_length', 'selected_cds_length',
-                   'cds_intron_lengths', 'cds_exon_lengths',
-                   "five_utr_length", "three_utr_length",
-                   "five_utr_num", "three_utr_num",
-                   "selected_end_distance_from_junction"]
-    data_tuple = namedtuple("transcript_data", data_fields)
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(accept_undefined_multi=True, *args, **kwargs)
-        self.exon_lengths = []
-        self.cds_exon_lengths = []
-        self.utr_exon_lengths = []
-
-        self.intron_lengths = []
-        self.cds_intron_lengths = []
-        self.utr_intron_lengths = []
-
-    def finalize(self):
-        """
-        Method to be called when all exons/features have been
-        added to the transcript. It will call the parent's finalize method,
-        followed by calculation of the necessary statistics.
-        """
-        try:
-            super().finalize()
-        except InvalidCDS:
-            super().strip_cds()
-
-        self.exon_lengths = [e[1] - e[0] + 1 for e in self.exons]
-        self.cds_exon_lengths = [c[1] - c[0] + 1 for c in self.selected_cds]
-        self.utr_exon_lengths = [u[1] - u[0] + 1 for u in self.three_utr + self.five_utr]
-
-        self.intron_lengths = [i[1] - i[0] + 1 for i in self.introns]
-        self.cds_intron_lengths = [i[1] - i[0] for i in self.selected_cds_introns]
-        self.utr_intron_lengths = [i[1] - i[0] for i in self.introns if
-                                   i not in self.selected_cds_introns]
-
-    def as_tuple(self):
-        """Method to build a namedtuple containing only the basic information for stat building.
-
-        We want to analyze the following:
-        - cDNA length
-        - CDS length
-        - Exons (number and length)
-        - CDS Exons (number and length)
-        - Introns (number and length)
-        - CDS Introns (number and length)
-        """
-
-        self.finalize()
-        constructor = dict()
-        for field in self.data_fields:
-            constructor[field] = getattr(self, field)
-
-        return self.data_tuple(**constructor)
-
-
-def finalize_wrapper(finalizable):
-
-    """
-    Wrapper around the finalize method for the object
-    :param finalizable: an object with the finalize method
-    :return:
-    """
-
-    finalizable.finalize()
-    return finalizable
-
-
 class Calculator:
 
     """
@@ -252,6 +174,9 @@ def parse_input(self):
                     raise TypeError("No parent found for:\n{0}".format(str(record)))
                 elif record.feature == "match":
                     gid = record.id
+                elif len(record.parent) == 0:
+                    self.__logger.warning("No gene found for %s, creating a mock one.", record.id)
+                    gid = record.id + ".gene"
                 else:
                     gid = record.parent[0]
                     # assert current_gene is not None, record

diff --git a/Mikado/tests/test_transcript_single.py b/Mikado/tests/test_transcript_single.py
@@ -63,11 +63,6 @@ def setUp(self):
         self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
         self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)
 
-    def test_invalid_inizialization(self):
-
-        with self.assertRaises(TypeError):
-            _ =  loci.Transcript(self.tr_gff_lines[1])
-
     def test_basics(self):
 
         self.assertEqual(self.tr.chrom, "Chr1")

diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py
@@ -537,8 +537,6 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)):
         if transcript_row.is_transcript is False:
             if transcript_row.is_exon is False and transcript_row.feature not in ("match", "tss", "tts"):
                 raise TypeError("Invalid GF line")
-            elif transcript_row.is_exon is True and isinstance(transcript_row, GffLine) and transcript_row.feature not in ("cDNA_match", "match"):
-                raise TypeError("GFF files should not provide orphan exons. Line:\n{}".format(transcript_row))
             self.__expandable = True
 
             if "cDNA_match" in transcript_row.feature and isinstance(transcript_row, GffLine):
@@ -562,6 +560,8 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)):
                 #         self.start = transcript_row.end
                 #     else:
                 #         self.start = transcript_row.start
+            elif transcript_row.is_exon is True and isinstance(transcript_row, GffLine) and transcript_row.feature not in ("cDNA_match", "match"):
+                self.id = transcript_row.parent[0]
             else:
                 self.parent = transcript_row.gene
                 self.id = transcript_row.transcript
@@ -1644,7 +1644,8 @@ def get_available_metrics(cls) -> list:
     def get_modifiable_metrics(cls) -> list:
 
         metrics = [member[0] for member in inspect.getmembers(cls) if
-                   "__" not in member[0] and isinstance(cls.__dict__[member[0]], Metric)
+                   "__" not in member[0] and member[0] in cls.__dict__ and
+                   isinstance(cls.__dict__[member[0]], Metric)
                    and getattr(cls.__dict__[member[0]], "fset") is not None]
         return metrics
 

diff --git a/Mikado/transcripts/transcript_methods/finalizing.py b/Mikado/transcripts/transcript_methods/finalizing.py
@@ -5,6 +5,7 @@
 
 from Mikado.utilities.intervaltree import Interval, IntervalTree
 from Mikado.utilities.overlap import overlap
+from collections import defaultdict
 import operator
 # from sys import intern
 from Mikado.exceptions import InvalidCDS, InvalidTranscript
@@ -137,66 +138,66 @@ def _check_cdna_vs_utr(transcript):
         orfs = [IntervalTree.from_tuples([_[1] for _ in orf if _[0] == "CDS"]) for orf in transcript.internal_orfs]
         assert isinstance(combined_cds, IntervalTree)
 
+        exons = IntervalTree.from_intervals([Interval(*exon) for exon in transcript.exons])
+
+        mapper = defaultdict(list)
+        for cds in transcript.combined_cds:
+            fexon = exons.find(cds[0] - 1, cds[1], strict=False)
+            if len(fexon) > 1:
+                raise InvalidCDS(
+                    "{} has a CDS ({}) which straddles {} different exons ({}).".format(
+                        transcript.id, cds, len(fexon), fexon
+                    )
+                )
+            elif len(fexon) == 0:
+                raise InvalidCDS(
+                    "{} has a CDS ({}) which is not mapped to any exon.".format(
+                        transcript.id, cds, len(fexon), fexon
+                    )
+                )
+            mapper[fexon[0]].append(cds)
+
         for exon in transcript.exons:
-            assert isinstance(exon, tuple), type(exon)
-            found = combined_cds.find(exon[0], exon[1])
-            if len(found) == 0 and exon not in transcript.combined_cds:  # Second condition due to BUG
-                # Exon completely noncoding
+            if exon not in mapper:
                 transcript.combined_utr.append(exon)
-            elif len(found) == 0:
-                # Bug, see above
                 continue
-            elif len(found) == 1:
-                found = found[0]
-                if found.start == exon[0] and found.end == exon[1]:
-                    # The exon is completely coding
+            elif len(mapper[exon]) == 1:
+                cds = mapper[exon][0]
+                if cds[0] == exon[0] and exon[1] == cds[1]:
                     continue
                 else:
-                    # I have to find all the regions of the exon which are not coding
                     before = None
                     after = None
-                    if found.start > exon[0]:
-                        before = (exon[0], max(found.start - 1, exon[0]))
+                    if cds[0] < exon[0] or cds[1] > exon[1]:
+                        raise InvalidCDS(
+                            "{} in {} debords its exon {}".format(cds, transcript.id, exon)
+                        )
+                    if cds[0] > exon[0]:
+                        before = (exon[0], max(cds[0] - 1, exon[0]))
                         transcript.combined_utr.append(before)
-                    if found.end < exon[1]:
-                        after = (min(found.end + 1, exon[1]), exon[1])
+                    if cds[1] < exon[1]:
+                        after = (min(cds[1] + 1, exon[1]), exon[1])
                         transcript.combined_utr.append(after)
-                    assert before or after, (exon, found)
+                    assert before or after, (exon, cds)
             else:
-                # The exon is overlapping *two* different CDS segments! This is valid *only* if there are multiple ORFs
-                if len(found) > len(transcript.internal_orfs):
-                    raise InvalidCDS(
-                        "Found in {} an exon ({}) which is overlapping with more CDS segments than there are ORFs.".format(
-                            transcript.id, exon
-                        ))
-                # Now we have to check for each internal ORF that things are OK
-                for orf in orfs:
-                    orf_found = orf.find(exon[0], exon[1])
-                    if len(orf_found) > 1:
-                        raise InvalidCDS(
-                            "Found in {} an exon ({}) which is overlapping with more CDS segments in a single ORF.".format(
-                                transcript.id, exon
-                            ))
-                # If we are here, it means that the internal UTR is legit. We should now add the untranslated regions
-                # to the store.
                 transcript.logger.debug("Starting to find the UTRs for %s", exon)
-                found = sorted(found)
+                found = sorted(mapper[exon])
                 utrs = []
                 for pos, interval in enumerate(found):
                     if pos == len(found) - 1:
-                        if exon[1] > interval.end:
-                            utrs.append((min(exon[1], interval.end + 1), exon[1]))
+                        if exon[1] > interval[1]:
+                            utrs.append((min(exon[1], interval[1] + 1), exon[1]))
                         continue
-                    if pos == 0 and exon[0] < interval.start:
-                        utrs.append((exon[0], max(exon[0], interval.start - 1)))
+                    if pos == 0 and exon[0] < interval[0]:
+                        utrs.append((exon[0], max(exon[0], interval[0] - 1)))
                     next_interval = found[pos + 1]
-                    if not (interval.end + 1 <= next_interval.start - 1):
+                    if not (interval[1] + 1 <= next_interval[0] - 1):
                         raise InvalidCDS(
                             "Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found.")
-                    utrs.append((interval.end + 1, next_interval.start - 1))
+                    utrs.append((interval[1] + 1, next_interval[0] - 1))
                 assert utrs, found
                 utr_sum = sum([_[1] - _[0] + 1 for _ in utrs])
-                cds_sum = sum(_.end - _.start + 1 for _ in found)
+                cds_sum = sum(_[1] - _[0] + 1 for _ in found)
                 assert utr_sum + cds_sum == exon[1] - exon[0] + 1, (utr_sum, cds_sum,
                                                                     exon[1] - exon[0] + 1, utrs, found)
                 transcript.combined_utr.extend(utrs)
@@ -259,11 +260,13 @@ def __calculate_introns(transcript):
         # Start calculating the selected CDS introns
         for first, second in zip(transcript.selected_cds[:-1],
                                  transcript.selected_cds[1:]):
-            assert first != second, transcript.selected_cds
+            assert first != second, (transcript.id, transcript.selected_cds)
             # assert first[1] < second[0], (first, second)
             first, second = sorted([first, second])
             intron = tuple([first[1] + 1, second[0] - 1])
-            assert intron in transcript.introns, (intron, first, second)
+            if intron not in transcript.introns:
+                continue
+            # assert intron in transcript.introns, (transcript.id, intron, first, second)
             cds_introns.append(intron)
 
         cintrons = set(cds_introns)

diff --git a/Mikado/transcripts/transcriptcomputer.py b/Mikado/transcripts/transcriptcomputer.py
@@ -0,0 +1,71 @@
+from collections import namedtuple
+from .transcript import Transcript
+from ..exceptions import InvalidCDS
+
+
+class TranscriptComputer(Transcript):
+    """
+    Class that is used to calculate and store basic statistics about a transcript object.
+    """
+
+    data_fields = ["parent", 'chrom',
+                   'start', 'end',
+                   'introns', 'exons',
+                   'exon_lengths', 'intron_lengths',
+                   'cdna_length', 'selected_cds_length',
+                   'cds_intron_lengths', 'cds_exon_lengths',
+                   "five_utr_length", "three_utr_length",
+                   "five_utr_num", "three_utr_num",
+                   "selected_end_distance_from_junction"]
+    data_tuple = namedtuple("transcript_data", data_fields)
+
+    def __init__(self, *args, **kwargs):
+        kwargs.pop("accept_undefined_multi", None)
+        kwargs.pop("trust_orf", None)
+        super().__init__(accept_undefined_multi=True, trust_orf=True, *args, **kwargs)
+        self.exon_lengths = []
+        self.cds_exon_lengths = []
+        self.utr_exon_lengths = []
+
+        self.intron_lengths = []
+        self.cds_intron_lengths = []
+        self.utr_intron_lengths = []
+
+    def finalize(self):
+        """
+        Method to be called when all exons/features have been
+        added to the transcript. It will call the parent's finalize method,
+        followed by calculation of the necessary statistics.
+        """
+        try:
+            super().finalize()
+        except InvalidCDS:
+            super().strip_cds()
+
+        self.exon_lengths = [e[1] - e[0] + 1 for e in self.exons]
+        self.cds_exon_lengths = [c[1] - c[0] + 1 for c in self.selected_cds]
+        self.utr_exon_lengths = [u[1] - u[0] + 1 for u in self.three_utr + self.five_utr]
+
+        self.intron_lengths = [i[1] - i[0] + 1 for i in self.introns]
+        self.cds_intron_lengths = [i[1] - i[0] for i in self.selected_cds_introns]
+        self.utr_intron_lengths = [i[1] - i[0] for i in self.introns if
+                                   i not in self.selected_cds_introns]
+
+    def as_tuple(self):
+        """Method to build a namedtuple containing only the basic information for stat building.
+
+        We want to analyze the following:
+        - cDNA length
+        - CDS length
+        - Exons (number and length)
+        - CDS Exons (number and length)
+        - Introns (number and length)
+        - CDS Introns (number and length)
+        """
+
+        self.finalize()
+        constructor = dict()
+        for field in self.data_fields:
+            constructor[field] = getattr(self, field)
+
+        return self.data_tuple(**constructor)