Finish Release-1.0.2

Portcullis can now process BAM files that do not have the SEQ field populated. junctools gtf now considers monoexonic transcripts.
EI-CoreBioinformatics · Jun 8, 2017 · 240d0f4 · 240d0f4
2 parents 78e2203 + 2c6a9aa
commit 240d0f4
Show file tree

Hide file tree

Showing 6 changed files with 111 additions and 63 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
 ![alt text](doc/source/images/portcullis_logo.png "Portcullis")
 
-#Portcullis
+Portcullis
+==========
 
 Portcullis stands for PORTable CULLing of Invalid Splice junctions from pre-aligned 
 RNA-seq data.  It is known that RNAseq mapping tools generate many invalid junction
@@ -17,7 +18,8 @@ associated with `bad` junctions.  Both the filtered junctions and BAM files are
 and more usable resources which can more effectively be used to assist in downstream 
 analyses such as gene prediction and genome annotation. 
 
-##Installation:
+Installation
+------------
 
 There are two ways to install portcullis from source, either by cloning the git 
 repository, or by downloading a distributable package, the later method is generally 
@@ -69,7 +71,8 @@ located in non-standard locations are:
 Type ```./configure --help``` for full details.
 
 
-##Operating Instructions:
+Operating Instructions
+----------------------
 
 After portcullis has been installed, the ```portcullis``` executable should be available.
 
@@ -89,12 +92,14 @@ In addition to portcullis, we provide a tool-suite for manipulating junction fil
 
 An online version of the manual can be found here: [https://portcullis.readthedocs.org/en/latest/](https://portcullis.readthedocs.org/en/latest/).
 
-##Licensing:
+Licensing
+---------
 
 GNU GPL V3.  See COPYING file for more details.
 
 
-##Authors:
+Authors
+-------
 
  * Daniel Mapleson
  * Luca Venturini
@@ -103,7 +108,8 @@ GNU GPL V3.  See COPYING file for more details.
 See AUTHORS file for more details.
 
 
-##Acknowledgements:
+Acknowledgements
+----------------
 
 Affiliation: The Genome Analysis Centre (TGAC)
 Funding: The Biotechnology and Biological Sciences Research Council (BBSRC)
diff --git a/configure.ac b/configure.ac
@@ -4,7 +4,7 @@
 
 # Autoconf setup
 AC_PREREQ([2.68])
-AC_INIT([portcullis],[1.0.1],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk])
+AC_INIT([portcullis],[1.0.2],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk])
 AC_CONFIG_SRCDIR([src/portcullis.cc])
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_MACRO_DIR([m4])

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -53,9 +53,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.0.1'
+version = '1.0.2'
 # The full version, including alpha/beta/rc tags.
-release = '1.0.1'
+release = '1.0.2'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/lib/src/junction.cc b/lib/src/junction.cc
@@ -136,51 +136,69 @@ void portcullis::AlignmentInfo::calcMatchStats(const Intron& i, const uint32_t l
 	int32_t qRightStart = rightStart;
 	int32_t qRightEnd = (int32_t)rightEnd;   
 
-
-	string qAnchorLeft = ba->getPaddedQuerySeq(leftStart, leftEnd, qLeftStart, qLeftEnd, false);
-	string qAnchorRight = ba->getPaddedQuerySeq(rightStart, rightEnd, qRightStart, qRightEnd, false);
-	string gAnchorLeft = ba->getPaddedGenomeSeq(ancLeft, leftStart, leftEnd, qLeftStart, qLeftEnd, false);
-	string gAnchorRight = ba->getPaddedGenomeSeq(ancRight, rightStart, rightEnd, qRightStart, qRightEnd, false);
-	if (qAnchorLeft.size() != gAnchorLeft.size() || qAnchorLeft.empty()) {
-		BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
-								  "Left anchor region for query and genome are not the same size.")
-							  + "\nIntron: " + i.toString()
-							  + "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
-							  + "\nGenomic sequence: " + ancLeft
-							  + "\nAlignment coords (before soft clipping): " + ba->toString(false)
-							  + "\nAlignment coords (after soft clipping): " + ba->toString(true)
-							  + "\nRead name: " + ba->deriveName()
-							  + "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
-							  + "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
-							  + "\nCigar: " + ba->getCigarAsString()
-							  + "\nLeft Anchor query seq:  \n" + qAnchorLeft + " (" + lexical_cast<string>(qAnchorLeft.size()) + ")"
-							  + "\nLeft Anchor genome seq: \n" + gAnchorLeft + " (" + lexical_cast<string>(gAnchorLeft.size()) + ")"));
-	}
-	if (qAnchorRight.size() != gAnchorRight.size() || qAnchorRight.empty()) {
-		BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
-								  "Right Anchor region for query and genome are not the same size.")
-							  + "\nIntron: " + i.toString()
-							  + "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
-							  + "\nGenomic sequence: " + ancRight
-							  + "\nAlignment coords (before soft clipping): " + ba->toString(false)
-							  + "\nAlignment coords (after soft clipping): " + ba->toString(true)
-							  + "\nRead name: " + ba->deriveName()
-							  + "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
-							  + "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
-							  + "\nCigar: " + ba->getCigarAsString()
-							  + "\nRight Anchor query seq:  \n" + qAnchorRight + " (" + lexical_cast<string>(qAnchorRight.size()) + ")"
-							  + "\nRight Anchor genome seq: \n" + gAnchorRight + " (" + lexical_cast<string>(gAnchorRight.size()) + ")"));
-	}
-	totalUpstreamMismatches = SeqUtils::hammingDistance(qAnchorLeft, gAnchorLeft);
-	totalDownstreamMismatches = SeqUtils::hammingDistance(qAnchorRight, gAnchorRight);
-	totalUpstreamMatches = qAnchorLeft.size() - totalUpstreamMismatches;
-	totalDownstreamMatches = qAnchorRight.size() - totalDownstreamMismatches;
-	nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches;
-	upstreamMatches = getNbMatchesFromEnd(qAnchorLeft, gAnchorLeft);
-	downstreamMatches = getNbMatchesFromStart(qAnchorRight, gAnchorRight);
-	minMatch = min(upstreamMatches, downstreamMatches);
-	maxMatch = max(upstreamMatches, downstreamMatches);
-	mmes = min(totalUpstreamMatches, totalDownstreamMatches);
+    string query = ba->getQuerySeq();
+    if (query.size() <= 1) {
+        // In this case the genome and query sequences do not correspond with one another.  Most
+        // likely the cause of this is that the query sequence is not present in the alignment.  
+        // In which case just assume everything is fine.
+		totalUpstreamMismatches = 0;
+        totalDownstreamMismatches = 0;
+        totalUpstreamMatches = leftEnd - leftStart + 1;
+        totalDownstreamMatches = rightEnd - rightStart + 1;
+        nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches;
+        upstreamMatches = totalUpstreamMismatches;
+        downstreamMatches = totalDownstreamMismatches;
+        minMatch = min(upstreamMatches, downstreamMatches);
+        maxMatch = max(upstreamMatches, downstreamMatches);
+        mmes = min(totalUpstreamMatches, totalDownstreamMatches);
+    }
+    else {
+
+        string qAnchorLeft = ba->getPaddedQuerySeq(query, leftStart, leftEnd, qLeftStart, qLeftEnd, false);
+        string qAnchorRight = ba->getPaddedQuerySeq(query, rightStart, rightEnd, qRightStart, qRightEnd, false);
+        string gAnchorLeft = ba->getPaddedGenomeSeq(ancLeft, leftStart, leftEnd, qLeftStart, qLeftEnd, false);
+        string gAnchorRight = ba->getPaddedGenomeSeq(ancRight, rightStart, rightEnd, qRightStart, qRightEnd, false);    
+        if (qAnchorLeft.size() != gAnchorLeft.size() || qAnchorLeft.empty()) {
+            BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
+                                      "Left anchor region for query and genome are not the same size.")
+                                  + "\nIntron: " + i.toString()
+                                  + "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
+                                  + "\nGenomic sequence: " + ancLeft
+                                  + "\nAlignment coords (before soft clipping): " + ba->toString(false)
+                                  + "\nAlignment coords (after soft clipping): " + ba->toString(true)
+                                  + "\nRead name: " + ba->deriveName()
+                                  + "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
+                                  + "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
+                                  + "\nCigar: " + ba->getCigarAsString()
+                                  + "\nLeft Anchor query seq:  \n" + qAnchorLeft + " (" + lexical_cast<string>(qAnchorLeft.size()) + ")"
+                                  + "\nLeft Anchor genome seq: \n" + gAnchorLeft + " (" + lexical_cast<string>(gAnchorLeft.size()) + ")"));
+        }
+        if (qAnchorRight.size() != gAnchorRight.size() || qAnchorRight.empty()) {
+            BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
+                                      "Right Anchor region for query and genome are not the same size.")
+                                  + "\nIntron: " + i.toString()
+                                  + "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
+                                  + "\nGenomic sequence: " + ancRight
+                                  + "\nAlignment coords (before soft clipping): " + ba->toString(false)
+                                  + "\nAlignment coords (after soft clipping): " + ba->toString(true)
+                                  + "\nRead name: " + ba->deriveName()
+                                  + "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
+                                  + "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
+                                  + "\nCigar: " + ba->getCigarAsString()
+                                  + "\nRight Anchor query seq:  \n" + qAnchorRight + " (" + lexical_cast<string>(qAnchorRight.size()) + ")"
+                                  + "\nRight Anchor genome seq: \n" + gAnchorRight + " (" + lexical_cast<string>(gAnchorRight.size()) + ")"));
+        }
+        totalUpstreamMismatches = SeqUtils::hammingDistance(qAnchorLeft, gAnchorLeft);
+        totalDownstreamMismatches = SeqUtils::hammingDistance(qAnchorRight, gAnchorRight);
+        totalUpstreamMatches = qAnchorLeft.size() - totalUpstreamMismatches;
+        totalDownstreamMatches = qAnchorRight.size() - totalDownstreamMismatches;
+        nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches;
+        upstreamMatches = getNbMatchesFromEnd(qAnchorLeft, gAnchorLeft);
+        downstreamMatches = getNbMatchesFromStart(qAnchorRight, gAnchorRight);
+        minMatch = min(upstreamMatches, downstreamMatches);
+        maxMatch = max(upstreamMatches, downstreamMatches);
+        mmes = min(totalUpstreamMatches, totalDownstreamMatches);
+    }
 }
 
 uint32_t portcullis::AlignmentInfo::getNbMatchesFromStart(const string& query, const string& anchor) {

diff --git a/scripts/junctools/__init__.py b/scripts/junctools/__init__.py
@@ -9,7 +9,7 @@
 __author__ = 'Daniel Mapleson'
 __license__ = 'GPLV3'
 __copyright__ = 'Copyright 2016 Daniel Mapleson'
-__version__ = '1.0.1'
+__version__ = '1.0.2'
 
 import argparse
 import sys

diff --git a/scripts/junctools/gtf.py b/scripts/junctools/gtf.py
@@ -55,6 +55,7 @@ def loadgtf(filepath, use_strand=False):
 	intron_chains = collections.defaultdict(list)
 	junc_set = set()
 	nb_introns = 0
+	monoexonics = set()
 
 	for t, exons in transcripts.items():
 		le = None
@@ -72,6 +73,17 @@ def loadgtf(filepath, use_strand=False):
 				nb_introns += 1
 
 			le = e
+		if len(exons) == 1:
+			# HACK! Use a junction to represent the mono-exonic transcript
+			e = exons[0]
+			j = BedJunction(use_strand=use_strand)
+			j.refseq = e[0]
+			j.start = int(e[1])
+			j.end = int(e[2])
+			j.strand = e[3]
+			j.id = t
+			monoexonics.add(j)
+
 
 	last_id = ""
 	index = 1
@@ -95,16 +107,21 @@ def loadgtf(filepath, use_strand=False):
 	for j in junc_set:
 		junc_key_set.add(j.key)
 
-	return intron_chains, junc_key_set, len(transcripts), nb_introns
+	return intron_chains, junc_key_set, len(transcripts), nb_introns, monoexonics
 
-def run_compare(args, ref_juncs, ref_ics):
+def run_compare(args, ref_juncs, ref_monos, ref_ics):
 	print("\t".join(["file", "j_distinct", "j_total", "j_tp", "j_fp", "j_fn", "j_recall", "j_precision", "j_f1",
 					 "t_transcripts", "t_monoexonic", "t_multiexonic", "t_supported", "t_unsupported", "t_precision",
+					 "mt_tp", "mt_fp", "mt_fn", "mt_recall", "mt_precision", "mt_f1"
 					 "ic_tp", "ic_fp", "ic_fn", "ic_recall", "ic_precision", "ic_f1"]))
 	sys.stdout.flush()
 	for i in args.input:
-		intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(i, use_strand=not args.ignore_strand)
+		intron_chains, junc_set, nb_transcripts, nb_introns, monoexonics = loadgtf(i, use_strand=not args.ignore_strand)
 		nb_monoexonic = nb_transcripts - len(intron_chains)
+
+		if nb_monoexonic != len(monoexonics):
+			print("Monoexonic numbers don't agree for", i, nb_monoexonic, len(monoexonics), file=sys.stderr)
+
 		nb_multiexonic = len(intron_chains)
 		if nb_multiexonic <= 0:
 			print("skipped...nb_multiexonic=0")
@@ -146,18 +163,25 @@ def run_compare(args, ref_juncs, ref_ics):
 			else:
 				ic_fp += 1
 
+		mt_tp = len(ref_monos & monoexonics)
+		mt_fn = len(ref_monos - monoexonics)
+		mt_fp = len(monoexonics - ref_monos)
+		mt_perf = Performance(tp=mt_tp, fp=mt_fp, fn=mt_fn)
+
 		nb_unsupported = nb_multiexonic - nb_in_ref
 		t_precision = (nb_in_ref / nb_multiexonic) * 100.0
 
 		ic_fn = len(ref_ics) - ic_tp
 		ic_perf = Performance(tp=ic_tp, fn=ic_fn, fp=ic_fp)
 
-		print("\t".join(str(_) for _ in [i, len(junc_set), nb_introns, jr_tp, jr_fp, jr_fn,
+		print("\t".join(str(_) for _ in [i, len(junc_set), nb_introns,
+										 jr_tp, jr_fp, jr_fn,
 										 "{0:.2f}".format(jr_perf.recall()), "{0:.2f}".format(jr_perf.precision()), "{0:.2f}".format(jr_perf.F1()),
 										 nb_transcripts, nb_monoexonic, nb_multiexonic, nb_in_ref, nb_unsupported, "{0:.2f}".format(t_precision),
+										 mt_tp, mt_fp, mt_fn,
+										 "{0:.2f}".format(mt_perf.recall()), "{0:.2f}".format(mt_perf.precision()), "{0:.2f}".format(mt_perf.F1()),
 										 ic_tp, ic_fp, ic_fn,
-										 "{0:.2f}".format(ic_perf.recall()), "{0:.2f}".format(ic_perf.precision()),
-										 "{0:.2f}".format(ic_perf.F1())]))
+										 "{0:.2f}".format(ic_perf.recall()), "{0:.2f}".format(ic_perf.precision()), "{0:.2f}".format(ic_perf.F1())]))
 		sys.stdout.flush()
 
 def keyFromIC(ic):
@@ -200,12 +224,12 @@ def gtf(args):
 		print(" done.  Found", ref_junc_count, "junctions (", len(ref_juncs), "distinct ) .")
 	else:
 		print("# Extracting junctions from reference transcript file ...", end="")
-		ref_intron_chains, ref_juncs, ref_transcript_count, ref_junc_count = loadgtf(args.transcripts, use_strand=not args.ignore_strand)
+		ref_intron_chains, ref_juncs, ref_transcript_count, ref_junc_count, ref_monos = loadgtf(args.transcripts, use_strand=not args.ignore_strand)
 		ref_ics = convert_ic_map(ref_intron_chains)
 		print(" done.  Found", ref_junc_count, "junctions (", len(ref_juncs), "distinct ) and", ref_transcript_count, "transcripts (", len(ref_ics), "distinct intron chains).")
 
 	if mode == Mode.COMPARE:
-		run_compare(args, ref_juncs, ref_ics)
+		run_compare(args, ref_juncs, ref_monos, ref_ics)
 
 	else:
 		print("Loading transcripts ...",end="")