Skip to content

Commit

Permalink
Special notation for repeated bases in adapter seq: A{3} == AAA
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelm committed Feb 17, 2015
1 parent 20c9d2d commit 5bc4505
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 5 deletions.
6 changes: 5 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Changes
=======


v1.8
----

Expand All @@ -19,6 +18,11 @@ v1.8

This feature has not been extensively tested, so please give feedback if
something does not work.
* Support notation for repeated bases in the adapter sequence: Write ``A{10}``
instead of ``AAAAAAAAAA``. Useful for poly-A trimming: Use ``-a A{100}`` to
get the longest possible tail.
* Fix incorrectly reported statistics (> 100% trimmed bases) when ``--times``
set to a value greater than one.

v1.7
----
Expand Down
38 changes: 36 additions & 2 deletions cutadapt/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
from __future__ import print_function, division, absolute_import
import sys
import re
from collections import defaultdict
from cutadapt import align, colorspace
from cutadapt.seqio import ColorspaceSequence, FastaReader
Expand Down Expand Up @@ -37,7 +38,7 @@ def parse_adapter(sequence, where):
(sequence, where).
"""
if where == FRONT and sequence.startswith('^'):
return (sequence[1:], PREFIX)
return (sequence[1:], PREFIX)
if where == BACK and sequence.endswith('$'):
return (sequence[:-1], SUFFIX)
return (sequence, where)
Expand Down Expand Up @@ -166,7 +167,7 @@ def __init__(self, sequence, where, max_error_rate, min_overlap=3,
self.name = name
self.name_is_generated = False

self.sequence = sequence.upper().replace('U', 'T')
self.sequence = self.parse_braces(sequence.upper().replace('U', 'T'))
self.where = where
self.max_error_rate = max_error_rate
self.min_overlap = min_overlap
Expand Down Expand Up @@ -212,6 +213,39 @@ def __repr__(self):
read_wildcards=read_wildcards,
**vars(self))

@staticmethod
def parse_braces(sequence):
# Simple DFA with four states, encoded in prev
result = ''
prev = None
for s in re.split('(\{|\})', sequence):
if s == '':
continue
if prev is None:
if s == '{':
raise ValueError('"{" must be used after a character')
if s == '}':
raise ValueError('"}" cannot be used here')
prev = s
result += s
elif prev == '{':
prev = int(s)
if not 0 <= prev <= 10000:
raise ValueError('Value {} invalid'.format(prev))
elif isinstance(prev, int):
if s != '}':
raise ValueError('"}" expected')
result = result[:-1] + result[-1] * prev
prev = None
else:
if s != '{':
raise ValueError('Expected "{"')
prev = '{'
# Check if we are in a non-terminating state
if isinstance(prev, int) or prev == '{':
raise ValueError("Unterminated expression")
return result

def match_to(self, read):
"""
Try to match this adapter to the given read and return an AdapterMatch instance.
Expand Down
15 changes: 15 additions & 0 deletions doc/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,21 @@ or ``ADAPTER`` as used here in the examples).
Wildcards do not work in colorspace.


Repeated bases in the adapter sequence
--------------------------------------

If you have many repeated bases in the adapter sequence, such as many ``N``s or
many ``A``s, you do not have to spell them out. For example, instead of writing
ten ``A`` in a row (``AAAAAAAAAA``), write ``A{10}`` instead. The number within
the curly braces specifies how often the character that preceeds it will be
repeated. This works also for IUPAC wildcard characters, as in ``N{5}``.

It is recommended that you use quotation marks around your adapter sequence if
you use this feature. For poly-A trimming, for example, you would write::

cutadapt -a "A{100}" -o output.fastq input.fastq


.. _cut-bases:

Removing a fixed number of bases
Expand Down
1 change: 0 additions & 1 deletion doc/ideas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ This would be a way to specify less strict anchoring.
Make it possible to specify that the rightmost or leftmost match should be
picked. Default right now: Leftmost, even for -g adapters.

Allow ``A{15}`` to mean “``A`` repeated 15 times”.
Allow ``N{3,10}`` as in regular expressions (for a variable-length sequence).


Expand Down
1 change: 1 addition & 0 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Table of contents
installation
guide
colorspace
recipes
ideas
changes

Expand Down
83 changes: 83 additions & 0 deletions doc/recipes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
=======
Recipes
=======

For some trimming applications, the pre-defined adapter types behave differently
from what you would like to have. In this section, we show some ways in which
cutadapt can be made to behave in the desired way.

.. note:: This section is still being written.


Forcing matches to be at the end of the read
--------------------------------------------

Use ``-a TACGGCATXXX``. The ``X`` is always counted as a mismatch and will force
the adapter match to be at the end. This is not the same as an anchored 3'
adapter since partial matches are still allowed.


Removing more than one adapter
------------------------------

If you want to remove more than one adapter, let's say a 5' adapter and a 3'
adapter, you have two options.

First, you can specify both adapters and also ``--times=2`` (or the short
version ``-n 2``). For example::

cutadapt -g ^TTAAGGCC -a TACGGACT -n 2 -o output.fastq input.fastq

This instructs cutadapt to run two rounds of adapter finding and removal. That
means that, after the first round and only when an adapter was actually found,
another round is performed. In both rounds, all given adapters (two in this
case) are searched and removed. The problem is that it could happen that one
adapter is found twice (so the 3' adapter, for example, could be removed twice).

The second option is to not use the ``-n`` option, but to run cutadapt twice,
first removing one adapter and then the other. It is easiest if you use a pipe
as in this example::

cutadapt -g ^TTAAGGCC input.fastq | cutadapt -a TACGGACT - > output.fastq


Trimming poly-A tails
---------------------

If you want to trim a poly-A tail from the 3' end of your reads, use the 3'
adapter type (``-a``) with an adapter sequence of many repeated ``A``
nucleotides. Starting with version 1.8 of cutadapt, you can use the
following notation to specify a sequence that consists of 100 ``A``::

cutadapt -a "A{100}" -o output.fastq input.fastq

This also works when there are sequencing errors in the poly-A tail. So this
read ::

TACGTACGTACGTACGAAATAAAAAAAAAAA

will be trimmed to::

TACGTACGTACGTACG

If for some reason you would like to use a shorter sequence of ``A``, you can
do so: The matching algorithm always picks the leftmost match that it can find,
so cutadapt will do the right thing even when the tail has more ``A`` than you
used in the adapter sequence. However, sequencing errors may result in shorter
matches than desired. For example, using ``-a "A{10}"``, the read above (where
the ``AAAT`` is followed by eleven ``A``) would be trimmed to::

TACGTACGTACGTACGAAAT

Depending on your application, perhaps a variant of ``-a A{10}N{90}`` is an
alternative, forcing the match to be located as much to the left as possible,
while still allowing for non-``A`` bases towards the end of the read.


Other things (unfinished)
-------------------------

* How to detect adapters
* Use cutadapt for quality-trimming only
* Use it for minimum/maximum length filtering
* Use it for conversion to FASTQ
25 changes: 24 additions & 1 deletion tests/testadapters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# coding: utf-8
from __future__ import print_function, division, absolute_import
from nose.tools import raises
from nose.tools import raises, assert_raises

from cutadapt.seqio import Sequence
from cutadapt.adapters import Adapter, AdapterMatch, ColorspaceAdapter, FRONT, BACK
Expand Down Expand Up @@ -67,3 +67,26 @@ def test_str():
def test_color():
ColorspaceAdapter('0123', where=FRONT, max_error_rate=0.1)


def test_parse_braces():
assert Adapter.parse_braces('') == ''
assert Adapter.parse_braces('A') == 'A'
assert Adapter.parse_braces('A{0}') == ''
assert Adapter.parse_braces('A{1}') == 'A'
assert Adapter.parse_braces('A{2}') == 'AA'
assert Adapter.parse_braces('A{2}C') == 'AAC'
assert Adapter.parse_braces('ACGTN{3}TGACCC') == 'ACGTNNNTGACCC'
assert Adapter.parse_braces('ACGTN{10}TGACCC') == 'ACGTNNNNNNNNNNTGACCC'
assert Adapter.parse_braces('ACGTN{3}TGA{4}CCC') == 'ACGTNNNTGAAAACCC'
assert Adapter.parse_braces('ACGTN{0}TGA{4}CCC') == 'ACGTTGAAAACCC'


def test_parse_braces_fail():
for expression in ['{', '}', '{}', '{5', '{1}', 'A{-7}', 'A{', 'A{1', 'N{7', 'AN{7', 'A{4{}',
'A{4}{3}', 'A{b}', 'A{6X}', 'A{X6}']:
print(expression)
try:
Adapter.parse_braces(expression)
except ValueError as e:
print(e)
assert_raises(ValueError, lambda: Adapter.parse_braces(expression))
11 changes: 11 additions & 0 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ def test_polya():
'''poly-A tails'''
run("-m 24 -O 10 -a AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "polya.fasta", "polya.fasta")

def test_polya_brace_notation():
'''poly-A tails'''
run("-m 24 -O 10 -a A{35}", "polya.fasta", "polya.fasta")

def test_mask_adapter():
'''mask adapter with N (reads maintain the same length)'''
run("-b CAAG -n 3 --mask-adapter", "anywhere_repeat.fastq", "anywhere_repeat.fastq")
Expand Down Expand Up @@ -179,6 +183,13 @@ def test_literal_N():
def test_literal_N2():
run("-N -O 1 -g NNNNNNNNNNNNNN", "trimN5.fasta", "trimN5.fasta")

def test_literal_N_brace_notation():
'''test matching literal 'N's'''
run("-N -e 0.2 -a N{14}", "trimN3.fasta", "trimN3.fasta")

def test_literal_N2_brace_notation():
run("-N -O 1 -g N{14}", "trimN5.fasta", "trimN5.fasta")

def test_anchored_front():
run("-g ^FRONTADAPT -N", "anchored.fasta", "anchored.fasta")

Expand Down

0 comments on commit 5bc4505

Please sign in to comment.