Skip to content

Commit

Permalink
This should fix the stats part for EI-CoreBioinformatics#226. Next, c…
Browse files Browse the repository at this point in the history
…ompare
  • Loading branch information
lucventurini committed Oct 8, 2019
1 parent 21866a0 commit 42f3de3
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 136 deletions.
18 changes: 11 additions & 7 deletions Mikado/loci/reference_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import operator
from sys import intern
from ..transcripts.transcript import Transcript
from ..transcripts.transcriptcomputer import TranscriptComputer
from ..exceptions import InvalidTranscript, InvalidCDS
from ..parsers.GFF import GffLine
from ..parsers.GTF import GtfLine
Expand Down Expand Up @@ -49,11 +50,6 @@ def __init__(self, transcr: [None, Transcript], gid=None, logger=None, only_codi
elif isinstance(transcr, GffLine):
if transcr.is_gene is True:
self.__from_gene = True
elif (from_exon is False) and ("match" not in transcr.feature):
raise AssertionError(str(transcr))
elif from_exon is True:
self.__from_gene = False

self.id = transcr.id
self.attributes = transcr.attributes.copy()
self.feature = transcr.feature
Expand Down Expand Up @@ -170,7 +166,15 @@ def add_exon(self, row):
self.transcripts[tid].add_exon(row)
break
if not found:
raise AssertionError("{}\n{}".format(parent, self.transcripts, row))
self.transcripts[parent] = TranscriptComputer(row,
trust_orf=True,
logger=self.logger,
accept_undefined_multi=True,
source=row.source,
is_reference=True,
)
self.transcripts[parent].parent = self.id
# raise AssertionError("{}\n{}".format(parent, self.transcripts, row))

def __getitem__(self, tid: str) -> Transcript:
return self.transcripts[tid]
Expand Down Expand Up @@ -260,7 +264,7 @@ def load_dict(self, state, exclude_utr=False, protein_coding=False, trust_orf=Fa
setattr(self, key, state[key])

for tid, tvalues in state["transcripts"].items():
transcript = Transcript(logger=self.logger)
transcript = TranscriptComputer(logger=self.logger)
transcript.load_dict(tvalues, trust_orf=trust_orf)
transcript.finalize()
if protein_coding is True and transcript.is_coding is False:
Expand Down
83 changes: 4 additions & 79 deletions Mikado/subprograms/util/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
import argparse
import csv
from ...exceptions import InvalidCDS
from ...transcripts.transcriptcomputer import TranscriptComputer
from .. import to_gff
from ...loci import Transcript, Gene
from ...parsers.GFF import GFF3
Expand Down Expand Up @@ -92,84 +92,6 @@ def weighted_percentile(a, percentile=numpy.array([75, 25]), weights=None):
return out_percentiles


class TranscriptComputer(Transcript):
"""
Class that is used to calculate and store basic statistics about a transcript object.
"""

data_fields = ["parent", 'chrom',
'start', 'end',
'introns', 'exons',
'exon_lengths', 'intron_lengths',
'cdna_length', 'selected_cds_length',
'cds_intron_lengths', 'cds_exon_lengths',
"five_utr_length", "three_utr_length",
"five_utr_num", "three_utr_num",
"selected_end_distance_from_junction"]
data_tuple = namedtuple("transcript_data", data_fields)

def __init__(self, *args, **kwargs):
super().__init__(accept_undefined_multi=True, *args, **kwargs)
self.exon_lengths = []
self.cds_exon_lengths = []
self.utr_exon_lengths = []

self.intron_lengths = []
self.cds_intron_lengths = []
self.utr_intron_lengths = []

def finalize(self):
"""
Method to be called when all exons/features have been
added to the transcript. It will call the parent's finalize method,
followed by calculation of the necessary statistics.
"""
try:
super().finalize()
except InvalidCDS:
super().strip_cds()

self.exon_lengths = [e[1] - e[0] + 1 for e in self.exons]
self.cds_exon_lengths = [c[1] - c[0] + 1 for c in self.selected_cds]
self.utr_exon_lengths = [u[1] - u[0] + 1 for u in self.three_utr + self.five_utr]

self.intron_lengths = [i[1] - i[0] + 1 for i in self.introns]
self.cds_intron_lengths = [i[1] - i[0] for i in self.selected_cds_introns]
self.utr_intron_lengths = [i[1] - i[0] for i in self.introns if
i not in self.selected_cds_introns]

def as_tuple(self):
"""Method to build a namedtuple containing only the basic information for stat building.
We want to analyze the following:
- cDNA length
- CDS length
- Exons (number and length)
- CDS Exons (number and length)
- Introns (number and length)
- CDS Introns (number and length)
"""

self.finalize()
constructor = dict()
for field in self.data_fields:
constructor[field] = getattr(self, field)

return self.data_tuple(**constructor)


def finalize_wrapper(finalizable):

"""
Wrapper around the finalize method for the object
:param finalizable: an object with the finalize method
:return:
"""

finalizable.finalize()
return finalizable


class Calculator:

"""
Expand Down Expand Up @@ -252,6 +174,9 @@ def parse_input(self):
raise TypeError("No parent found for:\n{0}".format(str(record)))
elif record.feature == "match":
gid = record.id
elif len(record.parent) == 0:
self.__logger.warning("No gene found for %s, creating a mock one.", record.id)
gid = record.id + ".gene"
else:
gid = record.parent[0]
# assert current_gene is not None, record
Expand Down
5 changes: 0 additions & 5 deletions Mikado/tests/test_transcript_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,6 @@ def setUp(self):
self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3, 0)

def test_invalid_inizialization(self):

with self.assertRaises(TypeError):
_ = loci.Transcript(self.tr_gff_lines[1])

def test_basics(self):

self.assertEqual(self.tr.chrom, "Chr1")
Expand Down
7 changes: 4 additions & 3 deletions Mikado/transcripts/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,6 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)):
if transcript_row.is_transcript is False:
if transcript_row.is_exon is False and transcript_row.feature not in ("match", "tss", "tts"):
raise TypeError("Invalid GF line")
elif transcript_row.is_exon is True and isinstance(transcript_row, GffLine) and transcript_row.feature not in ("cDNA_match", "match"):
raise TypeError("GFF files should not provide orphan exons. Line:\n{}".format(transcript_row))
self.__expandable = True

if "cDNA_match" in transcript_row.feature and isinstance(transcript_row, GffLine):
Expand All @@ -562,6 +560,8 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)):
# self.start = transcript_row.end
# else:
# self.start = transcript_row.start
elif transcript_row.is_exon is True and isinstance(transcript_row, GffLine) and transcript_row.feature not in ("cDNA_match", "match"):
self.id = transcript_row.parent[0]
else:
self.parent = transcript_row.gene
self.id = transcript_row.transcript
Expand Down Expand Up @@ -1644,7 +1644,8 @@ def get_available_metrics(cls) -> list:
def get_modifiable_metrics(cls) -> list:

metrics = [member[0] for member in inspect.getmembers(cls) if
"__" not in member[0] and isinstance(cls.__dict__[member[0]], Metric)
"__" not in member[0] and member[0] in cls.__dict__ and
isinstance(cls.__dict__[member[0]], Metric)
and getattr(cls.__dict__[member[0]], "fset") is not None]
return metrics

Expand Down
87 changes: 45 additions & 42 deletions Mikado/transcripts/transcript_methods/finalizing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from Mikado.utilities.intervaltree import Interval, IntervalTree
from Mikado.utilities.overlap import overlap
from collections import defaultdict
import operator
# from sys import intern
from Mikado.exceptions import InvalidCDS, InvalidTranscript
Expand Down Expand Up @@ -137,66 +138,66 @@ def _check_cdna_vs_utr(transcript):
orfs = [IntervalTree.from_tuples([_[1] for _ in orf if _[0] == "CDS"]) for orf in transcript.internal_orfs]
assert isinstance(combined_cds, IntervalTree)

exons = IntervalTree.from_intervals([Interval(*exon) for exon in transcript.exons])

mapper = defaultdict(list)
for cds in transcript.combined_cds:
fexon = exons.find(cds[0] - 1, cds[1], strict=False)
if len(fexon) > 1:
raise InvalidCDS(
"{} has a CDS ({}) which straddles {} different exons ({}).".format(
transcript.id, cds, len(fexon), fexon
)
)
elif len(fexon) == 0:
raise InvalidCDS(
"{} has a CDS ({}) which is not mapped to any exon.".format(
transcript.id, cds, len(fexon), fexon
)
)
mapper[fexon[0]].append(cds)

for exon in transcript.exons:
assert isinstance(exon, tuple), type(exon)
found = combined_cds.find(exon[0], exon[1])
if len(found) == 0 and exon not in transcript.combined_cds: # Second condition due to BUG
# Exon completely noncoding
if exon not in mapper:
transcript.combined_utr.append(exon)
elif len(found) == 0:
# Bug, see above
continue
elif len(found) == 1:
found = found[0]
if found.start == exon[0] and found.end == exon[1]:
# The exon is completely coding
elif len(mapper[exon]) == 1:
cds = mapper[exon][0]
if cds[0] == exon[0] and exon[1] == cds[1]:
continue
else:
# I have to find all the regions of the exon which are not coding
before = None
after = None
if found.start > exon[0]:
before = (exon[0], max(found.start - 1, exon[0]))
if cds[0] < exon[0] or cds[1] > exon[1]:
raise InvalidCDS(
"{} in {} debords its exon {}".format(cds, transcript.id, exon)
)
if cds[0] > exon[0]:
before = (exon[0], max(cds[0] - 1, exon[0]))
transcript.combined_utr.append(before)
if found.end < exon[1]:
after = (min(found.end + 1, exon[1]), exon[1])
if cds[1] < exon[1]:
after = (min(cds[1] + 1, exon[1]), exon[1])
transcript.combined_utr.append(after)
assert before or after, (exon, found)
assert before or after, (exon, cds)
else:
# The exon is overlapping *two* different CDS segments! This is valid *only* if there are multiple ORFs
if len(found) > len(transcript.internal_orfs):
raise InvalidCDS(
"Found in {} an exon ({}) which is overlapping with more CDS segments than there are ORFs.".format(
transcript.id, exon
))
# Now we have to check for each internal ORF that things are OK
for orf in orfs:
orf_found = orf.find(exon[0], exon[1])
if len(orf_found) > 1:
raise InvalidCDS(
"Found in {} an exon ({}) which is overlapping with more CDS segments in a single ORF.".format(
transcript.id, exon
))
# If we are here, it means that the internal UTR is legit. We should now add the untranslated regions
# to the store.
transcript.logger.debug("Starting to find the UTRs for %s", exon)
found = sorted(found)
found = sorted(mapper[exon])
utrs = []
for pos, interval in enumerate(found):
if pos == len(found) - 1:
if exon[1] > interval.end:
utrs.append((min(exon[1], interval.end + 1), exon[1]))
if exon[1] > interval[1]:
utrs.append((min(exon[1], interval[1] + 1), exon[1]))
continue
if pos == 0 and exon[0] < interval.start:
utrs.append((exon[0], max(exon[0], interval.start - 1)))
if pos == 0 and exon[0] < interval[0]:
utrs.append((exon[0], max(exon[0], interval[0] - 1)))
next_interval = found[pos + 1]
if not (interval.end + 1 <= next_interval.start - 1):
if not (interval[1] + 1 <= next_interval[0] - 1):
raise InvalidCDS(
"Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found.")
utrs.append((interval.end + 1, next_interval.start - 1))
utrs.append((interval[1] + 1, next_interval[0] - 1))
assert utrs, found
utr_sum = sum([_[1] - _[0] + 1 for _ in utrs])
cds_sum = sum(_.end - _.start + 1 for _ in found)
cds_sum = sum(_[1] - _[0] + 1 for _ in found)
assert utr_sum + cds_sum == exon[1] - exon[0] + 1, (utr_sum, cds_sum,
exon[1] - exon[0] + 1, utrs, found)
transcript.combined_utr.extend(utrs)
Expand Down Expand Up @@ -259,11 +260,13 @@ def __calculate_introns(transcript):
# Start calculating the selected CDS introns
for first, second in zip(transcript.selected_cds[:-1],
transcript.selected_cds[1:]):
assert first != second, transcript.selected_cds
assert first != second, (transcript.id, transcript.selected_cds)
# assert first[1] < second[0], (first, second)
first, second = sorted([first, second])
intron = tuple([first[1] + 1, second[0] - 1])
assert intron in transcript.introns, (intron, first, second)
if intron not in transcript.introns:
continue
# assert intron in transcript.introns, (transcript.id, intron, first, second)
cds_introns.append(intron)

cintrons = set(cds_introns)
Expand Down
71 changes: 71 additions & 0 deletions Mikado/transcripts/transcriptcomputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from collections import namedtuple
from .transcript import Transcript
from ..exceptions import InvalidCDS


class TranscriptComputer(Transcript):
"""
Class that is used to calculate and store basic statistics about a transcript object.
"""

data_fields = ["parent", 'chrom',
'start', 'end',
'introns', 'exons',
'exon_lengths', 'intron_lengths',
'cdna_length', 'selected_cds_length',
'cds_intron_lengths', 'cds_exon_lengths',
"five_utr_length", "three_utr_length",
"five_utr_num", "three_utr_num",
"selected_end_distance_from_junction"]
data_tuple = namedtuple("transcript_data", data_fields)

def __init__(self, *args, **kwargs):
kwargs.pop("accept_undefined_multi", None)
kwargs.pop("trust_orf", None)
super().__init__(accept_undefined_multi=True, trust_orf=True, *args, **kwargs)
self.exon_lengths = []
self.cds_exon_lengths = []
self.utr_exon_lengths = []

self.intron_lengths = []
self.cds_intron_lengths = []
self.utr_intron_lengths = []

def finalize(self):
"""
Method to be called when all exons/features have been
added to the transcript. It will call the parent's finalize method,
followed by calculation of the necessary statistics.
"""
try:
super().finalize()
except InvalidCDS:
super().strip_cds()

self.exon_lengths = [e[1] - e[0] + 1 for e in self.exons]
self.cds_exon_lengths = [c[1] - c[0] + 1 for c in self.selected_cds]
self.utr_exon_lengths = [u[1] - u[0] + 1 for u in self.three_utr + self.five_utr]

self.intron_lengths = [i[1] - i[0] + 1 for i in self.introns]
self.cds_intron_lengths = [i[1] - i[0] for i in self.selected_cds_introns]
self.utr_intron_lengths = [i[1] - i[0] for i in self.introns if
i not in self.selected_cds_introns]

def as_tuple(self):
"""Method to build a namedtuple containing only the basic information for stat building.
We want to analyze the following:
- cDNA length
- CDS length
- Exons (number and length)
- CDS Exons (number and length)
- Introns (number and length)
- CDS Introns (number and length)
"""

self.finalize()
constructor = dict()
for field in self.data_fields:
constructor[field] = getattr(self, field)

return self.data_tuple(**constructor)

0 comments on commit 42f3de3

Please sign in to comment.