Skip to content

Commit

Permalink
Finish Release-1.0.2
Browse files Browse the repository at this point in the history
Portcullis can now process BAM files that do not have the SEQ field populated.

junctools gtf now considers monoexonic transcripts.
  • Loading branch information
Daniel Mapleson committed Jun 8, 2017
2 parents 78e2203 + 2c6a9aa commit 240d0f4
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 63 deletions.
18 changes: 12 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
![alt text](doc/source/images/portcullis_logo.png "Portcullis")

#Portcullis
Portcullis
==========

Portcullis stands for PORTable CULLing of Invalid Splice junctions from pre-aligned
RNA-seq data. It is known that RNAseq mapping tools generate many invalid junction
Expand All @@ -17,7 +18,8 @@ associated with `bad` junctions. Both the filtered junctions and BAM files are
and more usable resources which can more effectively be used to assist in downstream
analyses such as gene prediction and genome annotation.

##Installation:
Installation
------------

There are two ways to install portcullis from source, either by cloning the git
repository, or by downloading a distributable package, the later method is generally
Expand Down Expand Up @@ -69,7 +71,8 @@ located in non-standard locations are:
Type ```./configure --help``` for full details.


##Operating Instructions:
Operating Instructions
----------------------

After portcullis has been installed, the ```portcullis``` executable should be available.

Expand All @@ -89,12 +92,14 @@ In addition to portcullis, we provide a tool-suite for manipulating junction fil

An online version of the manual can be found here: [https://portcullis.readthedocs.org/en/latest/](https://portcullis.readthedocs.org/en/latest/).

##Licensing:
Licensing
---------

GNU GPL V3. See COPYING file for more details.


##Authors:
Authors
-------

* Daniel Mapleson
* Luca Venturini
Expand All @@ -103,7 +108,8 @@ GNU GPL V3. See COPYING file for more details.
See AUTHORS file for more details.


##Acknowledgements:
Acknowledgements
----------------

Affiliation: The Genome Analysis Centre (TGAC)
Funding: The Biotechnology and Biological Sciences Research Council (BBSRC)
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# Autoconf setup
AC_PREREQ([2.68])
AC_INIT([portcullis],[1.0.1],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk])
AC_INIT([portcullis],[1.0.2],[daniel.mapleson@earlham.ac.uk],[portcullis],[http://www.earlham.ac.uk])
AC_CONFIG_SRCDIR([src/portcullis.cc])
AC_CONFIG_AUX_DIR([build-aux])
AC_CONFIG_MACRO_DIR([m4])
Expand Down
4 changes: 2 additions & 2 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
# built documents.
#
# The short X.Y version.
version = '1.0.1'
version = '1.0.2'
# The full version, including alpha/beta/rc tags.
release = '1.0.1'
release = '1.0.2'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
108 changes: 63 additions & 45 deletions lib/src/junction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,51 +136,69 @@ void portcullis::AlignmentInfo::calcMatchStats(const Intron& i, const uint32_t l
int32_t qRightStart = rightStart;
int32_t qRightEnd = (int32_t)rightEnd;


string qAnchorLeft = ba->getPaddedQuerySeq(leftStart, leftEnd, qLeftStart, qLeftEnd, false);
string qAnchorRight = ba->getPaddedQuerySeq(rightStart, rightEnd, qRightStart, qRightEnd, false);
string gAnchorLeft = ba->getPaddedGenomeSeq(ancLeft, leftStart, leftEnd, qLeftStart, qLeftEnd, false);
string gAnchorRight = ba->getPaddedGenomeSeq(ancRight, rightStart, rightEnd, qRightStart, qRightEnd, false);
if (qAnchorLeft.size() != gAnchorLeft.size() || qAnchorLeft.empty()) {
BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
"Left anchor region for query and genome are not the same size.")
+ "\nIntron: " + i.toString()
+ "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
+ "\nGenomic sequence: " + ancLeft
+ "\nAlignment coords (before soft clipping): " + ba->toString(false)
+ "\nAlignment coords (after soft clipping): " + ba->toString(true)
+ "\nRead name: " + ba->deriveName()
+ "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
+ "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
+ "\nCigar: " + ba->getCigarAsString()
+ "\nLeft Anchor query seq: \n" + qAnchorLeft + " (" + lexical_cast<string>(qAnchorLeft.size()) + ")"
+ "\nLeft Anchor genome seq: \n" + gAnchorLeft + " (" + lexical_cast<string>(gAnchorLeft.size()) + ")"));
}
if (qAnchorRight.size() != gAnchorRight.size() || qAnchorRight.empty()) {
BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
"Right Anchor region for query and genome are not the same size.")
+ "\nIntron: " + i.toString()
+ "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
+ "\nGenomic sequence: " + ancRight
+ "\nAlignment coords (before soft clipping): " + ba->toString(false)
+ "\nAlignment coords (after soft clipping): " + ba->toString(true)
+ "\nRead name: " + ba->deriveName()
+ "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
+ "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
+ "\nCigar: " + ba->getCigarAsString()
+ "\nRight Anchor query seq: \n" + qAnchorRight + " (" + lexical_cast<string>(qAnchorRight.size()) + ")"
+ "\nRight Anchor genome seq: \n" + gAnchorRight + " (" + lexical_cast<string>(gAnchorRight.size()) + ")"));
}
totalUpstreamMismatches = SeqUtils::hammingDistance(qAnchorLeft, gAnchorLeft);
totalDownstreamMismatches = SeqUtils::hammingDistance(qAnchorRight, gAnchorRight);
totalUpstreamMatches = qAnchorLeft.size() - totalUpstreamMismatches;
totalDownstreamMatches = qAnchorRight.size() - totalDownstreamMismatches;
nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches;
upstreamMatches = getNbMatchesFromEnd(qAnchorLeft, gAnchorLeft);
downstreamMatches = getNbMatchesFromStart(qAnchorRight, gAnchorRight);
minMatch = min(upstreamMatches, downstreamMatches);
maxMatch = max(upstreamMatches, downstreamMatches);
mmes = min(totalUpstreamMatches, totalDownstreamMatches);
string query = ba->getQuerySeq();
if (query.size() <= 1) {
// In this case the genome and query sequences do not correspond with one another. Most
// likely the cause of this is that the query sequence is not present in the alignment.
// In which case just assume everything is fine.
totalUpstreamMismatches = 0;
totalDownstreamMismatches = 0;
totalUpstreamMatches = leftEnd - leftStart + 1;
totalDownstreamMatches = rightEnd - rightStart + 1;
nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches;
upstreamMatches = totalUpstreamMismatches;
downstreamMatches = totalDownstreamMismatches;
minMatch = min(upstreamMatches, downstreamMatches);
maxMatch = max(upstreamMatches, downstreamMatches);
mmes = min(totalUpstreamMatches, totalDownstreamMatches);
}
else {

string qAnchorLeft = ba->getPaddedQuerySeq(query, leftStart, leftEnd, qLeftStart, qLeftEnd, false);
string qAnchorRight = ba->getPaddedQuerySeq(query, rightStart, rightEnd, qRightStart, qRightEnd, false);
string gAnchorLeft = ba->getPaddedGenomeSeq(ancLeft, leftStart, leftEnd, qLeftStart, qLeftEnd, false);
string gAnchorRight = ba->getPaddedGenomeSeq(ancRight, rightStart, rightEnd, qRightStart, qRightEnd, false);
if (qAnchorLeft.size() != gAnchorLeft.size() || qAnchorLeft.empty()) {
BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
"Left anchor region for query and genome are not the same size.")
+ "\nIntron: " + i.toString()
+ "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
+ "\nGenomic sequence: " + ancLeft
+ "\nAlignment coords (before soft clipping): " + ba->toString(false)
+ "\nAlignment coords (after soft clipping): " + ba->toString(true)
+ "\nRead name: " + ba->deriveName()
+ "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
+ "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
+ "\nCigar: " + ba->getCigarAsString()
+ "\nLeft Anchor query seq: \n" + qAnchorLeft + " (" + lexical_cast<string>(qAnchorLeft.size()) + ")"
+ "\nLeft Anchor genome seq: \n" + gAnchorLeft + " (" + lexical_cast<string>(gAnchorLeft.size()) + ")"));
}
if (qAnchorRight.size() != gAnchorRight.size() || qAnchorRight.empty()) {
BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
"Right Anchor region for query and genome are not the same size.")
+ "\nIntron: " + i.toString()
+ "\nJunction anchor limits: " + lexical_cast<string>(leftStart) + "," + lexical_cast<string>(rightEnd)
+ "\nGenomic sequence: " + ancRight
+ "\nAlignment coords (before soft clipping): " + ba->toString(false)
+ "\nAlignment coords (after soft clipping): " + ba->toString(true)
+ "\nRead name: " + ba->deriveName()
+ "\nRead seq (before soft clipping): " + ba->getQuerySeq() + " (" + lexical_cast<string>(ba->getQuerySeq().size()) + ")"
+ "\nRead seq (after soft clipping): " + ba->getQuerySeqAfterClipping() + " (" + lexical_cast<string>(ba->getQuerySeqAfterClipping().size()) + ")"
+ "\nCigar: " + ba->getCigarAsString()
+ "\nRight Anchor query seq: \n" + qAnchorRight + " (" + lexical_cast<string>(qAnchorRight.size()) + ")"
+ "\nRight Anchor genome seq: \n" + gAnchorRight + " (" + lexical_cast<string>(gAnchorRight.size()) + ")"));
}
totalUpstreamMismatches = SeqUtils::hammingDistance(qAnchorLeft, gAnchorLeft);
totalDownstreamMismatches = SeqUtils::hammingDistance(qAnchorRight, gAnchorRight);
totalUpstreamMatches = qAnchorLeft.size() - totalUpstreamMismatches;
totalDownstreamMatches = qAnchorRight.size() - totalDownstreamMismatches;
nbMismatches = totalUpstreamMismatches + totalDownstreamMismatches;
upstreamMatches = getNbMatchesFromEnd(qAnchorLeft, gAnchorLeft);
downstreamMatches = getNbMatchesFromStart(qAnchorRight, gAnchorRight);
minMatch = min(upstreamMatches, downstreamMatches);
maxMatch = max(upstreamMatches, downstreamMatches);
mmes = min(totalUpstreamMatches, totalDownstreamMatches);
}
}

uint32_t portcullis::AlignmentInfo::getNbMatchesFromStart(const string& query, const string& anchor) {
Expand Down
2 changes: 1 addition & 1 deletion scripts/junctools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
__author__ = 'Daniel Mapleson'
__license__ = 'GPLV3'
__copyright__ = 'Copyright 2016 Daniel Mapleson'
__version__ = '1.0.1'
__version__ = '1.0.2'

import argparse
import sys
Expand Down
40 changes: 32 additions & 8 deletions scripts/junctools/gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def loadgtf(filepath, use_strand=False):
intron_chains = collections.defaultdict(list)
junc_set = set()
nb_introns = 0
monoexonics = set()

for t, exons in transcripts.items():
le = None
Expand All @@ -72,6 +73,17 @@ def loadgtf(filepath, use_strand=False):
nb_introns += 1

le = e
if len(exons) == 1:
# HACK! Use a junction to represent the mono-exonic transcript
e = exons[0]
j = BedJunction(use_strand=use_strand)
j.refseq = e[0]
j.start = int(e[1])
j.end = int(e[2])
j.strand = e[3]
j.id = t
monoexonics.add(j)


last_id = ""
index = 1
Expand All @@ -95,16 +107,21 @@ def loadgtf(filepath, use_strand=False):
for j in junc_set:
junc_key_set.add(j.key)

return intron_chains, junc_key_set, len(transcripts), nb_introns
return intron_chains, junc_key_set, len(transcripts), nb_introns, monoexonics

def run_compare(args, ref_juncs, ref_ics):
def run_compare(args, ref_juncs, ref_monos, ref_ics):
print("\t".join(["file", "j_distinct", "j_total", "j_tp", "j_fp", "j_fn", "j_recall", "j_precision", "j_f1",
"t_transcripts", "t_monoexonic", "t_multiexonic", "t_supported", "t_unsupported", "t_precision",
"mt_tp", "mt_fp", "mt_fn", "mt_recall", "mt_precision", "mt_f1"
"ic_tp", "ic_fp", "ic_fn", "ic_recall", "ic_precision", "ic_f1"]))
sys.stdout.flush()
for i in args.input:
intron_chains, junc_set, nb_transcripts, nb_introns = loadgtf(i, use_strand=not args.ignore_strand)
intron_chains, junc_set, nb_transcripts, nb_introns, monoexonics = loadgtf(i, use_strand=not args.ignore_strand)
nb_monoexonic = nb_transcripts - len(intron_chains)

if nb_monoexonic != len(monoexonics):
print("Monoexonic numbers don't agree for", i, nb_monoexonic, len(monoexonics), file=sys.stderr)

nb_multiexonic = len(intron_chains)
if nb_multiexonic <= 0:
print("skipped...nb_multiexonic=0")
Expand Down Expand Up @@ -146,18 +163,25 @@ def run_compare(args, ref_juncs, ref_ics):
else:
ic_fp += 1

mt_tp = len(ref_monos & monoexonics)
mt_fn = len(ref_monos - monoexonics)
mt_fp = len(monoexonics - ref_monos)
mt_perf = Performance(tp=mt_tp, fp=mt_fp, fn=mt_fn)

nb_unsupported = nb_multiexonic - nb_in_ref
t_precision = (nb_in_ref / nb_multiexonic) * 100.0

ic_fn = len(ref_ics) - ic_tp
ic_perf = Performance(tp=ic_tp, fn=ic_fn, fp=ic_fp)

print("\t".join(str(_) for _ in [i, len(junc_set), nb_introns, jr_tp, jr_fp, jr_fn,
print("\t".join(str(_) for _ in [i, len(junc_set), nb_introns,
jr_tp, jr_fp, jr_fn,
"{0:.2f}".format(jr_perf.recall()), "{0:.2f}".format(jr_perf.precision()), "{0:.2f}".format(jr_perf.F1()),
nb_transcripts, nb_monoexonic, nb_multiexonic, nb_in_ref, nb_unsupported, "{0:.2f}".format(t_precision),
mt_tp, mt_fp, mt_fn,
"{0:.2f}".format(mt_perf.recall()), "{0:.2f}".format(mt_perf.precision()), "{0:.2f}".format(mt_perf.F1()),
ic_tp, ic_fp, ic_fn,
"{0:.2f}".format(ic_perf.recall()), "{0:.2f}".format(ic_perf.precision()),
"{0:.2f}".format(ic_perf.F1())]))
"{0:.2f}".format(ic_perf.recall()), "{0:.2f}".format(ic_perf.precision()), "{0:.2f}".format(ic_perf.F1())]))
sys.stdout.flush()

def keyFromIC(ic):
Expand Down Expand Up @@ -200,12 +224,12 @@ def gtf(args):
print(" done. Found", ref_junc_count, "junctions (", len(ref_juncs), "distinct ) .")
else:
print("# Extracting junctions from reference transcript file ...", end="")
ref_intron_chains, ref_juncs, ref_transcript_count, ref_junc_count = loadgtf(args.transcripts, use_strand=not args.ignore_strand)
ref_intron_chains, ref_juncs, ref_transcript_count, ref_junc_count, ref_monos = loadgtf(args.transcripts, use_strand=not args.ignore_strand)
ref_ics = convert_ic_map(ref_intron_chains)
print(" done. Found", ref_junc_count, "junctions (", len(ref_juncs), "distinct ) and", ref_transcript_count, "transcripts (", len(ref_ics), "distinct intron chains).")

if mode == Mode.COMPARE:
run_compare(args, ref_juncs, ref_ics)
run_compare(args, ref_juncs, ref_monos, ref_ics)

else:
print("Loading transcripts ...",end="")
Expand Down

0 comments on commit 240d0f4

Please sign in to comment.