marcelm · rhpvorderman · Mar 25, 2022 · Mar 23, 2022 · Mar 23, 2022 · Mar 23, 2022
diff --git a/generate_conversion_tables.py b/generate_conversion_tables.py
@@ -0,0 +1,73 @@
+import io
+
+
+def nucleotide_complements_table():
+    # A nice list of complements can be found at:
+    # http://www.reverse-complement.com/ambiguity.html
+    table = list(range(256))
+
+    table[ord('A')] = "'T'"
+    table[ord('a')] = "'t'"
+    table[ord('C')] = "'G'"
+    table[ord('c')] = "'g'"
+    table[ord('G')] = "'C'"
+    table[ord('g')] = "'c'"
+    table[ord('T')] = "'A'"
+    table[ord('t')] = "'a'"
+
+    table[ord('U')] = "'A'"
+    table[ord('u')] = "'a'"
+
+    # R, purine (A, G) vs Y, pyrimidine (C, T)
+    table[ord('R')] = "'Y'"
+    table[ord('r')] = "'y'"
+    table[ord('Y')] = "'R'"
+    table[ord('y')] = "'r'"
+
+    # K, keto (G, T) vs A, amino (A, C)
+    table[ord('K')] = "'M'"
+    table[ord('k')] = "'m'"
+    table[ord('M')] = "'K'"
+    table[ord('m')] = "'k'"
+
+    # B, not A, vs V, not T
+    table[ord('B')] = "'V'"
+    table[ord('b')] = "'v'"
+    table[ord('V')] = "'B'"
+    table[ord('v')] = "'b'"
+
+    # D, not C vs H, not G
+    table[ord('D')] = "'H'"
+    table[ord('d')] = "'h'"
+    table[ord('H')] = "'D'"
+    table[ord('h')] = "'d'"
+
+    # S, W and N's complements are the same. So they are not explicitly
+    # included above.
+    return table
+
+
+def make_table(variable_name, table, row_size=16):
+    out = io.StringIO()
+    out.write(variable_name + ' = {\n    ')
+    i = 0
+    for i, literal in enumerate(table):
+        if i % row_size == 0 and i != 0:
+            out.write("\n    ")
+        out.write(str(literal).rjust(3, " ") + ", ")
+    out.write("\n")
+    out.write("};\n")
+    return out.getvalue()
+
+
+def main():
+    with open("src/dnaio/_conversions.h", "wt", encoding="utf-8") as out:
+        out.write(make_table(
+            "static const char NUCLEOTIDE_COMPLEMENTS[256]",
+            nucleotide_complements_table())
+        )
+        out.write('\n')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/dnaio/_conversions.h b/src/dnaio/_conversions.h
@@ -0,0 +1,19 @@
+static const char NUCLEOTIDE_COMPLEMENTS[256] = {
+      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15, 
+     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31, 
+     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47, 
+     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63, 
+     64, 'T', 'V', 'G', 'H',  69,  70, 'C', 'D',  73,  74, 'M',  76, 'K',  78,  79, 
+     80,  81, 'Y',  83, 'A', 'A', 'B',  87,  88, 'R',  90,  91,  92,  93,  94,  95, 
+     96, 't', 'v', 'g', 'h', 101, 102, 'c', 'd', 105, 106, 'm', 108, 'k', 110, 111, 
+    112, 113, 'y', 115, 'a', 'a', 'b', 119, 120, 'r', 122, 123, 124, 125, 126, 127, 
+    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 
+    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 
+    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 
+    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 
+    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 
+    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 
+    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 
+};
+
diff --git a/src/dnaio/_core.pyi b/src/dnaio/_core.pyi
@@ -13,6 +13,7 @@ class SequenceRecord:
     def qualities_as_bytes(self) -> bytes: ...
     def fastq_bytes(self, two_headers: bool = ...) -> bytes: ...
     def is_mate(self, other: SequenceRecord) -> bool: ...
+    def reverse_complement(self) -> SequenceRecord: ...
 
 class BytesSequenceRecord:
     name: bytes
@@ -26,6 +27,7 @@ class BytesSequenceRecord:
     def __richcmp__(self, other: BytesSequenceRecord, op: int) -> bool: ...
     def fastq_bytes(self, two_headers: bool = ...) -> bytes: ...
     def is_mate(self, other: BytesSequenceRecord) -> bool: ...
+    def reverse_complement(self) -> BytesSequenceRecord: ...
 
 # Bytestring = Union[bytes, bytearray, memoryview]. Technically just 'bytes' is
 # acceptable as an alias, but even more technically this function supports all

diff --git a/src/dnaio/_core.pyx b/src/dnaio/_core.pyx
@@ -10,12 +10,16 @@ cimport cython
 
 cdef extern from "Python.h":
     unsigned char * PyUnicode_1BYTE_DATA(object o)
+    void * PyUnicode_DATA(object o)
     bint PyUnicode_IS_COMPACT_ASCII(object o)
     object PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
 
 cdef extern from "ascii_check.h":
     int string_is_ascii(char * string, size_t length)
 
+cdef extern from "_conversions.h":
+    const char NUCLEOTIDE_COMPLEMENTS[256]
+
 from .exceptions import FastqFormatError
 from ._util import shorten
 
@@ -28,6 +32,25 @@ def bytes_ascii_check(bytes string, Py_ssize_t length = -1):
     cdef bint ascii = string_is_ascii(PyBytes_AS_STRING(string), length)
     return ascii
 
+cdef void _reverse(char *src, char *dest, Py_ssize_t length):
+    cdef Py_ssize_t cursor
+    cdef Py_ssize_t reverse_cursor = length
+    for cursor in range(length):
+        reverse_cursor -= 1
+        dest[reverse_cursor] = src[cursor]
+    return
+
+
+cdef void _reverse_complement(char *src, char *dest, Py_ssize_t length):
+    cdef Py_ssize_t cursor
+    cdef Py_ssize_t reverse_cursor = length
+    cdef unsigned char nucleotide
+    for cursor in range(length):
+        reverse_cursor -= 1
+        nucleotide = src[cursor]
+        dest[reverse_cursor] = NUCLEOTIDE_COMPLEMENTS[nucleotide]
+    return
+
 
 cdef class SequenceRecord:
     """
@@ -231,6 +254,19 @@ cdef class SequenceRecord:
             char * header2_chars = <char *>PyUnicode_1BYTE_DATA(other._name)
         return record_ids_match(header1_chars, header2_chars, header1_length)
 
+    def reverse_complement(self):
+        name = self._name
+        cdef Py_ssize_t sequence_length = PyUnicode_GET_LENGTH(self._sequence)
+        cdef object reversed_sequence = PyUnicode_New(sequence_length, 127)
+        cdef object reversed_qualities = PyUnicode_New(sequence_length, 127)
+        _reverse_complement(<char *>PyUnicode_DATA(self._sequence),
+                            <char *>PyUnicode_DATA(reversed_sequence),
+                            sequence_length)
+        _reverse(<char *>PyUnicode_DATA(self._qualities),
+                 <char *>PyUnicode_DATA(reversed_qualities),
+                 sequence_length)
+        return SequenceRecord.__new__(SequenceRecord, name, reversed_sequence, reversed_qualities)
+
 
 cdef class BytesSequenceRecord:
     """
@@ -391,6 +427,19 @@ cdef class BytesSequenceRecord:
                                 PyBytes_AS_STRING(other._name),
                                 PyBytes_GET_SIZE(self._name))
 
+    def reverse_complement(self):
+        name = self._name
+        cdef Py_ssize_t sequence_length = PyBytes_GET_SIZE(self._sequence)
+        cdef object reversed_sequence = PyBytes_FromStringAndSize(NULL, sequence_length)
+        cdef object reversed_qualities = PyBytes_FromStringAndSize(NULL, sequence_length)
+        _reverse_complement(<char *>PyBytes_AS_STRING(self._sequence),
+                            <char *>PyBytes_AS_STRING(reversed_sequence),
+                            sequence_length)
+        _reverse(<char *>PyBytes_AS_STRING(self._qualities),
+                 <char *>PyBytes_AS_STRING(reversed_qualities),
+                 sequence_length)
+        return BytesSequenceRecord.__new__(BytesSequenceRecord, name, reversed_sequence, reversed_qualities)
+
 
 cdef bytes create_fastq_record(
     char * name,

diff --git a/tests/test_records.py b/tests/test_records.py
@@ -21,6 +21,15 @@ def test_fastq_bytes_two_headers(self):
     def test_is_mate_succes(self):
         assert SequenceRecord("name1", "A", "=").is_mate(SequenceRecord("name2", "GC", "FF"))
 
+    def test_reverse_complement(self):
+        assert SequenceRecord("name1",
+                              "ACGTUMRWSYKVHDBNacgtumrwsykvhdbn",
+                              "/AAAA/6E/EEEEEEEEEEEE/EEEEA///E/"
+                              ).reverse_complement() == \
+               SequenceRecord("name1",
+                              "nvhdbmrswykaacgtNVHDBMRSWYKAACGT",
+                              "/E///AEEEE/EEEEEEEEEEEE/E6/AAAA/")
+
     def test_init_name_bad(self):
         with pytest.raises(ValueError) as error:
             SequenceRecord("nąme1", "A", "=")
@@ -127,6 +136,15 @@ def test_is_mate_succes(self):
         assert BytesSequenceRecord(b"name1", b"A", b"=").is_mate(
             BytesSequenceRecord(b"name2", b"GC", b"FF"))
 
+    def test_reverse_complement(self):
+        assert BytesSequenceRecord(b"name1",
+                                   b"ACGTUMRWSYKVHDBNacgtumrwsykvhdbn",
+                                   b"/AAAA/6E/EEEEEEEEEEEE/EEEEA///E/"
+                                   ).reverse_complement() == \
+               BytesSequenceRecord(b"name1",
+                                   b"nvhdbmrswykaacgtNVHDBMRSWYKAACGT",
+                                   b"/E///AEEEE/EEEEEEEEEEEE/E6/AAAA/")
+
     def test_init_name_none(self):
         with pytest.raises(TypeError) as error:
             BytesSequenceRecord(None, b"A", b"=")