Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Permalink
Scrappie seqmappy and implementation of local penalty in mapping rout…
Browse files Browse the repository at this point in the history
…ines
  • Loading branch information
tmassingham-ont committed Mar 17, 2018
1 parent f9d3e7f commit 2c0d9a1
Show file tree
Hide file tree
Showing 14 changed files with 501 additions and 83 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Expand Up @@ -71,7 +71,7 @@ add_library (scrappie_objects OBJECT src/decode.c src/event_detection.c src/laye
set_property(TARGET scrappie_objects PROPERTY POSITION_INDEPENDENT_CODE 1)
add_library (scrappie_static STATIC $<TARGET_OBJECTS:scrappie_objects>)
set_target_properties(scrappie_static PROPERTIES OUTPUT_NAME scrappie CLEAN_DIRECT_OUTPUT 1)
add_executable (scrappie src/scrappie.c src/scrappie_raw.c src/scrappie_events.c src/scrappie_mappy.c src/scrappie_squiggle.c src/scrappie_subcommands.c src/scrappie_help.c src/fast5_interface.c)
add_executable (scrappie src/scrappie.c src/scrappie_raw.c src/scrappie_events.c src/scrappie_mappy.c src/scrappie_seqmappy.c src/scrappie_squiggle.c src/scrappie_subcommands.c src/scrappie_help.c src/fast5_interface.c)

if (BUILD_SHARED_LIB)
if (APPLE)
Expand Down Expand Up @@ -145,6 +145,7 @@ add_test(test_rawrgrgr_r94_call scrappie raw --model rgrgr_r94 ${USE_THREADS} ${
add_test(test_rawrgrgr_r95_call scrappie raw --model rgrgr_r95 ${USE_THREADS} ${READSDIR})
add_test(test_rawrnnrf_r94_call scrappie raw --model rnnrf_r94 ${USE_THREADS} ${READSDIR})
add_test(test_mappy scrappie mappy ${READSDIR}/${TESTREAD}.fa ${READSDIR}/${TESTREAD}.fast5)
add_test(test_seqmappy scrappie seqmappy ${READSDIR}/${TESTREAD}.fa ${READSDIR}/${TESTREAD}.fast5)
add_test(test_squiggle scrappie squiggle ${READSDIR}/test_squiggles.fa)
add_test(test_licence scrappie licence)
add_test(test_licence scrappie license)
Expand Down
5 changes: 3 additions & 2 deletions Makefile
@@ -1,8 +1,9 @@
buildDir ?= build
releaseType ?= Release

.PHONY: all
all: ${buildDir}/scrappie
.PHONY: all scrappie
all: scrappie
scrappie: ${buildDir}/scrappie

${buildDir}:
mkdir ${buildDir}
Expand Down
200 changes: 161 additions & 39 deletions src/decode.c

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions src/decode.h
Expand Up @@ -35,15 +35,15 @@ float squiggle_match_viterbi(const raw_table signal, const_scrappie_matrix param


bool are_bounds_sane(int const * low, int const * high, size_t nblock, size_t seqlen);
float map_to_sequence_viterbi(const_scrappie_matrix logpost, float stay_pen,
float skip_pen, int const *seq, size_t seqlen, int * path);
float map_to_sequence_forward(const_scrappie_matrix logpost, float stay_pen,
float skip_pen, int const *seq, size_t seqlen);
float map_to_sequence_viterbi_banded(const_scrappie_matrix logpost, float stay_pen,
float skip_pen, int const *seq, size_t seqlen,
float map_to_sequence_viterbi(const_scrappie_matrix logpost, float stay_pen, float skip_pen,
float local_pen, int const *seq, size_t seqlen, int * path);
float map_to_sequence_forward(const_scrappie_matrix logpost, float stay_pen, float skip_pen,
float local_pen, int const *seq, size_t seqlen);
float map_to_sequence_viterbi_banded(const_scrappie_matrix logpost, float stay_pen, float skip_pen,
float local_pen, int const *seq, size_t seqlen,
int const * poslow, int const * poshigh);
float map_to_sequence_forward_banded(const_scrappie_matrix logpost, float stay_pen,
float skip_pen, int const *seq, size_t seqlen,
float map_to_sequence_forward_banded(const_scrappie_matrix logpost, float stay_pen, float skip_pen,
float local_pen, int const *seq, size_t seqlen,
int const * poslow, int const * poshigh);


Expand Down
3 changes: 3 additions & 0 deletions src/scrappie.c
Expand Up @@ -40,6 +40,9 @@ int main(int argc, char *argv[]) {
case SCRAPPIE_MODE_MAPPY:
ret = main_mappy(argc - 1, argv + 1);
break;
case SCRAPPIE_MODE_SEQMAPPY:
ret = main_seqmappy(argc - 1, argv + 1);
break;
default:
ret = EXIT_FAILURE;
warnx("Unrecognised subcommand %s\n", argv[1]);
Expand Down
4 changes: 4 additions & 0 deletions src/scrappie_help.c
Expand Up @@ -69,6 +69,10 @@ int main_help(int argc, char *argv[]) {
help_options[0] = argv[1];
ret = main_mappy(2, help_options);
break;
case SCRAPPIE_MODE_SEQMAPPY:
help_options[0] = argv[1];
ret = main_seqmappy(2, help_options);
break;
default:
ret = EXIT_FAILURE;
warnx("Unrecognised subcommand %s\n", argv[1]);
Expand Down
2 changes: 1 addition & 1 deletion src/scrappie_mappy.c
Expand Up @@ -151,7 +151,7 @@ static struct argp argp = { options, parse_arg, args_doc, doc };
static scrappie_matrix sequence_to_squiggle(char const * base_seq, size_t n, bool rescale){
RETURN_NULL_IF(NULL == base_seq, NULL);

int * sequence = encode_bases_to_integers(base_seq, n);
int * sequence = encode_bases_to_integers(base_seq, n, 1);
RETURN_NULL_IF(NULL == sequence, NULL);

scrappie_matrix squiggle = dna_squiggle(sequence, n, rescale);
Expand Down
28 changes: 19 additions & 9 deletions src/scrappie_seq_helpers.c
Expand Up @@ -4,6 +4,8 @@

#include "scrappie_seq_helpers.h"

static int nbase = 4;

/** Converts a nucleotide base into integer
*
* Input may be uppercase or (optionally) lowercase. Ambiguous
Expand Down Expand Up @@ -42,16 +44,24 @@ int base_to_int(char base, bool allow_lower){
* @returns Array [n] containing encoding of sequence or NULL if an
* invalid base was encountered
**/
int * encode_bases_to_integers(char const * seq, size_t n){
int * iseq = calloc(n, sizeof(int));
for(size_t i=0 ; i < n ; i++){
int ib = base_to_int(seq[i], true);
iseq[i] = ib;
if(-1 == ib){
free(iseq);
iseq = NULL;
break;
int * encode_bases_to_integers(char const * seq, size_t n, size_t state_len){
const size_t nstate = n - state_len + 1;

int * iseq = calloc(nstate, sizeof(int));
for(size_t i=0 ; i < nstate ; i++){
int ib = 0;
for(size_t j=0 ; j < state_len ; j++){
int newbase = base_to_int(seq[i + j], true);
if(-1 == newbase){
free(iseq);
iseq = NULL;
break;
}

ib *= nbase;
ib += newbase;
}
iseq[i] = ib;
}

return iseq;
Expand Down
2 changes: 1 addition & 1 deletion src/scrappie_seq_helpers.h
Expand Up @@ -5,6 +5,6 @@
#include <stdbool.h>

int base_to_int(char c, bool allow_lower);
int * encode_bases_to_integers(char const * seq, size_t n);
int * encode_bases_to_integers(char const * seq, size_t n, size_t state_len);

#endif /* SCRAPPIE_SEQ_HELPERS */
241 changes: 241 additions & 0 deletions src/scrappie_seqmappy.c
@@ -0,0 +1,241 @@
#include <math.h>
#include <stdio.h>
//#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <unistd.h>

#include "decode.h"
#include "fast5_interface.h"
#include "kseq.h"
#include "networks.h"
#include "scrappie_common.h"
#include "scrappie_licence.h"
#include "scrappie_seq_helpers.h"
#include "scrappie_stdlib.h"
#include "util.h"

KSEQ_INIT(int, read)

// Doesn't play nice with other headers, include last
#include <argp.h>


extern const char *argp_program_version;
extern const char *argp_program_bug_address;
static char doc[] = "Scrappie seqmappy (local-global)";
static char args_doc[] = "fasta fast5";
static struct argp_option options[] = {
{"localpen", 'l', "float", 0, "Penalty for local matching"},
{"min_prob", 'm', "probability", 0, "Minimum bound on probability of match"},
{"output", 'o', "filename", 0, "Write to file rather than stdout"},
{"prefix", 'p', "string", 0, "Prefix to append to name of read"},
{"segmentation", 3, "chunk:percentile", 0, "Chunk size and percentile for variance based segmentation"},
{"skip", 's', "penalty", 0, "Penalty for skipping a base"},
{"stay", 'y', "penalty", 0, "Penalty for staying"},
{"trim", 't', "start:end", 0, "Number of samples to trim, as start:end"},
{"licence", 10, 0, 0, "Print licensing information"},
{"license", 11, 0, OPTION_ALIAS, "Print licensing information"},
{0}
};

typedef struct {
size_t n;
char * seq;
char * name;
} scrappie_seq_t;

struct arguments {
float local_pen;
float min_prob;
float skip_pen;
float stay_pen;
FILE * output;
char * prefix;
int trim_start;
int trim_end;
int varseg_chunk;
float varseg_thresh;

char * fasta_file;
char * fast5_file;
};

static struct arguments args = {
.local_pen = 4.0f,
.min_prob = 1e-5f,
.stay_pen = 0.0f,
.skip_pen = 0.0f,
.output = NULL,
.prefix = "",
.trim_start = 200,
.trim_end = 10,
.varseg_chunk = 100,
.varseg_thresh = 0.0f,

.fasta_file = NULL,
.fast5_file = NULL
};

static error_t parse_arg(int key, char *arg, struct argp_state *state) {
switch (key) {
int ret = 0;
char * next_tok = NULL;
case 'l':
args.local_pen = atof(arg);
break;
case 'm':
args.min_prob = atof(arg);
break;
case 'o':
args.output = fopen(arg, "w");
if(NULL == args.output){
errx(EXIT_FAILURE, "Failed to open \"%s\" for output.", arg);
}
break;
case 'p':
args.prefix = arg;
break;
case 's':
args.skip_pen = atof(arg);
assert(isfinite(args.skip_pen));
break;
case 't':
args.trim_start = atoi(strtok(arg, ":"));
next_tok = strtok(NULL, ":");
if(NULL != next_tok){
args.trim_end = atoi(next_tok);
} else {
args.trim_end = args.trim_start;
}
assert(args.trim_start >= 0);
assert(args.trim_end >= 0);
break;
case 'y':
args.stay_pen = atof(arg);
assert(isfinite(args.stay_pen));
break;
case 3:
args.varseg_chunk = atoi(strtok(arg, ":"));
next_tok = strtok(NULL, ":");
if(NULL == next_tok){
errx(EXIT_FAILURE, "--segmentation should be of form chunk:percentile");
}
args.varseg_thresh = atof(next_tok) / 100.0;
assert(args.varseg_chunk >= 0);
assert(args.varseg_thresh > 0.0 && args.varseg_thresh < 1.0);
break;
case 10:
case 11:
ret = fputs(scrappie_licence_text, stdout);
exit((EOF != ret) ? EXIT_SUCCESS : EXIT_FAILURE);
break;

case ARGP_KEY_NO_ARGS:
argp_usage(state);
break;

case ARGP_KEY_ARG:
args.fasta_file = state->argv[state->next - 1];
if(NULL == state->argv[state->next]){
errx(EXIT_FAILURE, "fast5 file is a required argument");
}
args.fast5_file = state->argv[state->next];
state->next = state->argc;
break;

default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}

static struct argp argp = { options, parse_arg, args_doc, doc };


static scrappie_seq_t read_sequence_from_fasta(char const * filename){
scrappie_seq_t seq = {0, NULL, NULL};

FILE * fh = fopen(filename, "r");
if(NULL == fh){
return seq;
}

kseq_t * kseqer = kseq_init(fileno(fh));
if(kseq_read(kseqer) >= 0){
char * name = calloc(kseqer->name.l + 1, sizeof(char));
char * base_seq = calloc(kseqer->seq.l, sizeof(char));
if(NULL == base_seq || NULL == name){
free(base_seq);
free(name);
} else {
seq.seq = strncpy(base_seq, kseqer->seq.s, kseqer->seq.l);
seq.name = strncpy(name, kseqer->name.s, kseqer->name.l);
seq.n = kseqer->seq.l;
}
}

kseq_destroy(kseqer);
fclose(fh);

return seq;
}


int main_seqmappy(int argc, char *argv[]) {
argp_parse(&argp, argc, argv, 0, 0, NULL);
if(NULL == args.output){
args.output = stdout;
}


// Open sequence file
scrappie_seq_t seq = read_sequence_from_fasta(args.fasta_file);
if(NULL == seq.seq){
warnx("Failed to open \"%s\" for input.\n", args.fasta_file);
return EXIT_FAILURE;
}

const size_t state_len = 5;
int * states = encode_bases_to_integers(seq.seq, seq.n, state_len);
const size_t nstate = seq.n - state_len + 1;

// Read raw signal and normalise
raw_table rt = read_raw(args.fast5_file, true);
rt = trim_and_segment_raw(rt, args.trim_start, args.trim_end, args.varseg_chunk, args.varseg_thresh);
if(NULL == rt.raw){
warnx("Failed to open \"%s\" for input and trim signal.\n", args.fasta_file);
return EXIT_FAILURE;
}
medmad_normalise_array(rt.raw + rt.start, rt.end - rt.start);


scrappie_matrix logpost = nanonet_rgrgr_r94_posterior(rt, args.min_prob, true);
if(NULL == logpost){
warnx("Failed to calculate posterior for \"%s\" ", args.fasta_file);
return EXIT_SUCCESS;
}

const size_t nblock = logpost->nc;
int * path = calloc(nblock, sizeof(int));
if(NULL != path){
float score = map_to_sequence_viterbi(logpost, args.stay_pen, args.skip_pen, args.local_pen, states, nstate, path);

fprintf(args.output, "# %s to %s -- score %f over %zu blocks (%f per block)\n", args.fast5_file, args.fasta_file, -score, nblock, -score / nblock);
fprintf(args.output, "block\tpos\n");
for(size_t i=0 ; i < nblock ; i++){
const int32_t pos = path[i];
fprintf(args.output, "%zu\t%d\n", i, pos);
}

free(path);
}


free(states);
logpost = free_scrappie_matrix(logpost);
free(seq.seq);
free(seq.name);

return EXIT_SUCCESS;
}
2 changes: 1 addition & 1 deletion src/scrappie_squiggle.c
Expand Up @@ -99,7 +99,7 @@ static struct argp argp = { options, parse_arg, args_doc, doc };
static scrappie_matrix sequence_to_squiggle(char const * base_seq, size_t n, bool rescale){
RETURN_NULL_IF(NULL == base_seq, NULL);

int * sequence = encode_bases_to_integers(base_seq, n);
int * sequence = encode_bases_to_integers(base_seq, n, 1);
RETURN_NULL_IF(NULL == sequence, NULL);

scrappie_matrix squiggle = dna_squiggle(sequence, n, rescale);
Expand Down

0 comments on commit 2c0d9a1

Please sign in to comment.