Skip to content

Commit

Permalink
fixed gtrfinder
Browse files Browse the repository at this point in the history
  • Loading branch information
lmdu committed Oct 15, 2023
1 parent daf39e2 commit b70a2c5
Show file tree
Hide file tree
Showing 8 changed files with 74 additions and 31 deletions.
4 changes: 3 additions & 1 deletion docs/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,16 @@ pytrf.STRFinder
pytrf.GTRFinder
---------------

.. py:class:: pytrf.GTRFinder(chrom, seq, max_motif=30, min_repeat=3, min_length=10)
.. py:class:: pytrf.GTRFinder(chrom, seq, min_motif=1, max_motif=100, min_repeat=3, min_length=10)
Find all exact or perfect generic tandem repeats (GTRs) in the input sequence that satisfy both the minimum repeat count and the minimum length

:param str chrom: the sequence name

:param str seq: the input DNA sequence

:param int min_motif: minimum length of motif sequence

:param int max_motif: maximum length of motif sequence

:param int min_repeat: minimum number of tandem repeats
Expand Down
2 changes: 1 addition & 1 deletion docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Changelog

Version 1.2.0 (2023-10-15)
--------------------------

- Fixed repeat search start position
- Optimized atr finder algorithm

Version 1.1.0 (2023-09-16)
Expand Down
14 changes: 7 additions & 7 deletions src/etr.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@ static void pytrf_etr_dealloc(pytrf_ETR *self) {

static PyObject* pytrf_etr_repr(pytrf_ETR *self) {
return PyUnicode_FromFormat("<ETR> (%S)%d @ %S:%zd-%zd", self->motif,
self->repeats, self->seqid, self->start, self->end);
self->repeat, self->seqid, self->start, self->end);
}

static PyObject* pytrf_etr_as_list(pytrf_ETR *self) {
return Py_BuildValue("OnnOiii", self->seqid, self->start, self->end, self->motif,
self->mlen, self->repeats, self->length);
self->mlen, self->repeat, self->length);
}

static PyObject* pytrf_etr_as_dict(pytrf_ETR *self) {
return Py_BuildValue("{s:O,s:n,s:n,s:O,s:i,s:i,s:i}", "chrom", self->seqid,
"start", self->start, "end", self->end, "motif", self->motif,
"type", self->mlen, "repeats", self->repeats, "length", self->length);
"type", self->mlen, "repeat", self->repeat, "length", self->length);
}

static PyObject* pytrf_etr_as_gff(pytrf_ETR *self, PyObject *args, PyObject *kwargs) {
Expand All @@ -40,8 +40,8 @@ static PyObject* pytrf_etr_as_gff(pytrf_ETR *self, PyObject *args, PyObject *kwa
return NULL;
}

return PyUnicode_FromFormat("%S\tpytrf\tETR\t%zd\t%zd\t.\t+\t.\tMotif=%S;Type=%d;Repeats=%d;Length=%d%s",
self->seqid, self->start, self->end, self->motif, self->mlen, self->repeats,
return PyUnicode_FromFormat("%S\tpytrf\tETR\t%zd\t%zd\t.\t+\t.\tMotif=%S;Type=%d;Repeat=%d;Length=%d%s",
self->seqid, self->start, self->end, self->motif, self->mlen, self->repeat,
self->length, terminator);
}

Expand All @@ -56,7 +56,7 @@ static PyObject* pytrf_etr_as_string(pytrf_ETR *self, PyObject *args, PyObject *

return PyUnicode_FromFormat("%S%s%zd%s%zd%s%S%s%d%s%d%s%d%s", self->seqid, separator,
self->start, separator, self->end, separator, self->motif,
separator, self->mlen, separator, self->repeats, separator,
separator, self->mlen, separator, self->repeat, separator,
self->length, terminator);
}

Expand All @@ -83,7 +83,7 @@ static PyMemberDef pytrf_etr_members[] = {
{"end", T_PYSSIZET, offsetof(pytrf_ETR, end), READONLY},
{"motif", T_OBJECT, offsetof(pytrf_ETR, motif), READONLY},
{"type", T_INT, offsetof(pytrf_ETR, mlen), READONLY},
{"repeats", T_INT, offsetof(pytrf_ETR, repeats), READONLY},
{"repeat", T_INT, offsetof(pytrf_ETR, repeat), READONLY},
{"length", T_INT, offsetof(pytrf_ETR, length), READONLY},
{NULL}
};
Expand Down
2 changes: 1 addition & 1 deletion src/etr.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ typedef struct {
int mlen;

//number of tandem repeats
int repeats;
int repeat;

//tandem repeat element length
int length;
Expand Down
69 changes: 52 additions & 17 deletions src/gtr.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,51 @@
#include "compat.h"
#include "structmember.h"

/*
 * Report whether a motif is just a shorter unit repeated a whole number
 * of times (e.g. "ACAC" is "AC" x 2), considering only unit lengths up to m.
 *
 * s: motif characters; only s[0] .. s[l-1] are read
 * l: motif length
 * m: largest unit length to test (callers pass the finder's min_motif)
 *
 * Returns 1 when such a shorter unit exists, 0 otherwise.
 */
static int is_redundant_motif(const char *s, int l, int m) {
	int i, j, b;

	//presumably with m == 1 the finder has already tried every shorter
	//unit length itself, so nothing can be redundant
	if (m == 1) {
		return 0;
	}

	//only proper units (j < l) qualify: a motif is not a repeat of itself.
	//Without this bound a motif with l == m was always reported redundant,
	//because b became 0 and the comparison below passed vacuously.
	for (j = 1; j <= m && j < l; ++j) {
		//the unit must tile the motif exactly; "ABABA" has period 2
		//but is not a repetition of "AB"
		if (l % j != 0) {
			continue;
		}

		b = l - j;
		i = 0;

		//s has period j when every character equals the one j positions later
		while ((i < b) && (s[i] == s[i+j])) {
			++i;
		}

		if (i == b) {
			return 1;
		}
	}

	return 0;
}

static PyObject* pytrf_gtrfinder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
int i;

static char* keywords[] = {"chrom", "seq", "max_motif", "min_repeat", "min_length", NULL};
static char* keywords[] = {"chrom", "seq", "min_motif", "max_motif", "min_repeat", "min_length", NULL};

pytrf_GTRFinder *obj = (pytrf_GTRFinder *)type->tp_alloc(type, 0);
if (!obj) return NULL;

obj->max_motif = 30;
obj->min_motif = 1;
obj->max_motif = 100;
obj->min_repeat = 3;
obj->min_length = 10;

//initialize start search position
obj->next_start = 0;

if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|iii", keywords, &obj->seqname, &obj->seqobj, &obj->max_motif, &obj->min_repeat, &obj->min_length)) {
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|iiii", keywords, &obj->seqname, &obj->seqobj, &obj->min_motif, &obj->max_motif, &obj->min_repeat, &obj->min_length)) {
return NULL;
}

obj->seq = PyUnicode_AsUTF8AndSize(obj->seqobj, &obj->size);
obj->motif = (char *)malloc(obj->max_motif + 1);

obj->boundary = (Py_ssize_t *)malloc(sizeof(Py_ssize_t) * (obj->max_motif+1));
for (i = 0; i <= obj->max_motif; ++i) {
Expand All @@ -48,6 +73,8 @@ static void pytrf_gtrfinder_dealloc(pytrf_GTRFinder *self) {
free(self->boundary);
}

free(self->motif);

self->seq = NULL;
Py_DECREF(self->seqname);
Py_DECREF(self->seqobj);
Expand Down Expand Up @@ -87,7 +114,7 @@ static PyObject* pytrf_gtrfinder_next(pytrf_GTRFinder *self) {
}

cs = i;
for (j = 1; j <= self->max_motif; ++j) {
for (j = self->min_motif; j <= self->max_motif; ++j) {
b = self->boundary[j];

while ((i < b) && (self->seq[i] == self->seq[i+j])) {
Expand All @@ -99,16 +126,24 @@ static PyObject* pytrf_gtrfinder_next(pytrf_GTRFinder *self) {
rl = rn * j;

if (rn >= self->min_repeat && rl >= self->min_length) {
memcpy(self->motif, self->seq+cs, j);
self->motif[j] = '\0';

if (is_redundant_motif(self->motif, j, self->min_motif)) {
i = cs;
continue;
}

pytrf_ETR *gtr = PyObject_New(pytrf_ETR, &pytrf_ETRType);

gtr->mlen = j;
gtr->repeats = rn;
gtr->repeat = rn;
gtr->length = rl;
gtr->start = cs + 1;
gtr->end = cs + rl;
gtr->seqid = Py_NewRef(self->seqname);
gtr->seqobj = Py_NewRef(self->seqobj);
gtr->motif = PyUnicode_Substring(self->seqobj, cs, cs + j);
gtr->motif = PyUnicode_FromString(self->motif);

self->next_start = gtr->end;
return (PyObject *)gtr;
Expand Down Expand Up @@ -141,21 +176,16 @@ static PyObject* pytrf_gtrfinder_as_list(pytrf_GTRFinder *self) {
//boundary
Py_ssize_t b;

//motif cache
char *motif;

PyObject *gtrs = PyList_New(0);
PyObject *tmp;

motif = (char *)malloc(self->max_motif + 1);

for (i = 0; i < self->size; ++i) {
if (self->seq[i] == 78) {
continue;
}

cs = i;
for (j = 1; j <= self->max_motif; ++j) {
for (j = self->min_motif; j <= self->max_motif; ++j) {
b = self->boundary[j];

while ((i < b) && (self->seq[i] == self->seq[i+j])) {
Expand All @@ -167,12 +197,18 @@ static PyObject* pytrf_gtrfinder_as_list(pytrf_GTRFinder *self) {
rl = rn * j;

if (rn >= self->min_repeat && rl >= self->min_length) {
memcpy(motif, self->seq+cs, j);
motif[j] = '\0';
memcpy(self->motif, self->seq+cs, j);
self->motif[j] = '\0';

if (is_redundant_motif(self->motif, j, self->min_motif)) {
i = cs;
continue;
}

gs = cs + 1;
ge = cs + rl;
tmp = Py_BuildValue("Onnsiii", self->seqname, gs, ge, motif, j, rn, rl);

tmp = Py_BuildValue("Onnsiii", self->seqname, gs, ge, self->motif, j, rn, rl);
PyList_Append(gtrs, tmp);
Py_DECREF(tmp);

Expand All @@ -184,7 +220,6 @@ static PyObject* pytrf_gtrfinder_as_list(pytrf_GTRFinder *self) {
}
}

free(motif);
return gtrs;
}

Expand Down
6 changes: 6 additions & 0 deletions src/gtr.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ typedef struct {
//pointer to sequence object
const char *seq;

//min motif length
int min_motif;

//max motif length
int max_motif;

Expand All @@ -40,6 +43,9 @@ typedef struct {
//boundary
Py_ssize_t *boundary;

//motif
char *motif;

} pytrf_GTRFinder;

extern PyTypeObject pytrf_GTRFinderType;
Expand Down
4 changes: 2 additions & 2 deletions src/str.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ static PyObject* pytrf_strfinder_next(pytrf_STRFinder *self) {
pytrf_ETR *ssr = PyObject_New(pytrf_ETR, &pytrf_ETRType);

ssr->mlen = j;
ssr->repeats = rl/j;
ssr->length = ssr->repeats * j;
ssr->repeat = rl/j;
ssr->length = ssr->repeat * j;
ssr->start = cs + 1;
ssr->end = cs + ssr->length;
ssr->seqid = Py_NewRef(self->seqname);
Expand Down
4 changes: 2 additions & 2 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
for s in fa:
pass

atrs = pytrf.ATRFinder(s.name, s.seq, min_motif_size=10, max_motif_size=100, min_seed_repeat=2, max_consecutive_error=3)
atrs = pytrf.GTRFinder(s.name, s.seq, min_motif=10, max_motif=100)

for atr in atrs:
atr.as_string()
print(atr.as_string())

#print(len(atrs.as_list()))

0 comments on commit b70a2c5

Please sign in to comment.