Skip to content

Commit

Permalink
feat: expose nextalign algorithm params in library
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov committed Feb 9, 2021
1 parent ac44502 commit 5e3900a
Show file tree
Hide file tree
Showing 13 changed files with 187 additions and 162 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
Checks: "*,-modernize-use-trailing-return-type,-modernize-avoid-c-arrays,-llvmlibc*,-fuchsia*,-readability-magic-numbers,-hicpp*,-llvm-header-guard,-google-readability-todo,-OCUnusedMacroInspection"
Checks: "*,-modernize-use-trailing-return-type,-modernize-avoid-c-arrays,-llvmlibc*,-fuchsia*,-readability-magic-numbers,-hicpp*,-llvm-header-guard,-google-readability-todo,-OCUnusedMacroInspection,-altera-struct-pack-align,-readability-named-parameter,-readability-function-cognitive-complexity,-readability-isolate-declaration,-readability-uppercase-literal-suffix"
HeaderFilterRegex: '^(?!emscripten\/|system\/|3rdparty\/).*$'
AnalyzeTemporaryDtors: true
UseColor: true
Expand Down
1 change: 1 addition & 0 deletions packages/nextalign/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ add_library(${PROJECT_NAME} STATIC
src/match/matchNuc.h
src/nextalign.cpp
src/nextalign_private.h
src/options.h
src/strip/stripInsertions.h
src/translate/decode.cpp
src/translate/decode.h
Expand Down
9 changes: 3 additions & 6 deletions packages/nextalign/benchmarks/src/AlignPairwise.benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <numeric>
#include <vector>

#include "../../src/options.h"
#include "../include/nextalign/nextalign.h"
#include "../src/align/alignPairwise.h"
#include "../src/align/getGapOpenCloseScores.h"
Expand All @@ -16,11 +17,7 @@

class AlignPairwiseAverageBench : public benchmark::Fixture {
protected:
const NextalignOptions options = {
.gapOpenInFrame = -5,
.gapOpenOutOfFrame = -6,
.genes = {},
};
const NextalignOptions options = OPTIONS_DEFAULT;
const std::vector<int> gapOpenClose = getGapOpenCloseScoresCodonAware(ref, geneMap, options);
const NucleotideSequence ref = toNucleotideSequence(reference);
std::vector<NucleotideSequence> nucSequences;
Expand All @@ -44,7 +41,7 @@ BENCHMARK_DEFINE_F(AlignPairwiseAverageBench, Average)(benchmark::State& st) {
for (const auto _ : st) {
for (int i = 0; i < n; ++i) {
const auto& input = nucSequences[i];
benchmark::DoNotOptimize(aln = alignPairwise(input, ref, gapOpenClose, 100));
benchmark::DoNotOptimize(aln = alignPairwise(input, ref, gapOpenClose, options.alignment, options.seedNuc));
}
}

Expand Down
23 changes: 21 additions & 2 deletions packages/nextalign/include/nextalign/nextalign.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,28 @@ struct AlgorithmInput {
std::string seq;
};


struct NextalignSeedOptions {
int seedLength = 21; // --nuc-seed-length --aa-seed-length
int minSeeds = 10;
int seedSpacing = 100;
int mismatchesAllowed = 3;
};

struct NextalignAlignmentOptions {
int minimalLength;
int scoreGapExtend = 0;
int scoreGapOpen = -6;
int scoreGapOpenInFrame = -5; // --score-gap-open-in-frame
int scoreMismatch = -1; // --score-mismatch
int scoreMatch = 3; // --score-match
int maxIndel = 400;
};

struct NextalignOptions {
int gapOpenInFrame = -5;
int gapOpenOutOfFrame = -6;
NextalignAlignmentOptions alignment;
NextalignSeedOptions seedNuc;
NextalignSeedOptions seedAa;
std::set<std::string> genes;
};

Expand Down
84 changes: 42 additions & 42 deletions packages/nextalign/src/align/alignPairwise.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#include "alignPairwise.h"

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>

#include "alignPairwise.h"
#include "../alphabet/aminoacids.h"
#include "../alphabet/nucleotides.h"
#include "../match/matchAa.h"
Expand Down Expand Up @@ -37,14 +38,6 @@ class ErrorAlignmentBadSeedMatches : public std::runtime_error {
}
};


AlignmentParameters alignmentParameters = {
.gapExtend = 0,
.gapOpen = -6,
.misMatch = -1,
.match = 3,
};

// store direction info for backtrace as bits in paths matrix
// these indicate the currently optimal move
constexpr const int MATCH = 1 << 0;
Expand All @@ -61,7 +54,7 @@ constexpr const int END_OF_SEQUENCE = -1;
// part of the sequence and this is likely to produce errors for genomes with repeated sequence
template<typename Letter>
SeedMatch seedMatch(
const Sequence<Letter>& kmer, const Sequence<Letter>& ref, const int start_pos, const int allowed_mismatches) {
const Sequence<Letter>& kmer, const Sequence<Letter>& ref, const int start_pos, const int mismatchesAllowed) {
const int refSize = safe_cast<int>(ref.size());
const int kmerSize = safe_cast<int>(kmer.size());
int tmpScore = 0;
Expand All @@ -74,7 +67,7 @@ SeedMatch seedMatch(
tmpScore++;
}
// TODO: this speeds up seed-matching by disregarding bad seeds.
if (tmpScore + allowed_mismatches < pos) {
if (tmpScore + mismatchesAllowed < pos) {
break;
}
}
Expand Down Expand Up @@ -124,22 +117,26 @@ std::vector<int> getMapToGoodPositions(const Sequence<Letter>& query, int seedLe
}

template<typename Letter>
SeedAlignment seedAlignment(const Sequence<Letter>& query, const Sequence<Letter>& ref) {
SeedAlignment seedAlignment(
const Sequence<Letter>& query, const Sequence<Letter>& ref, const NextalignSeedOptions& options) {
const int querySize = safe_cast<int>(query.size());
const int refSize = safe_cast<int>(ref.size());

constexpr const int seedLength = 21;
constexpr const int allowed_mismatches = 3;
// clang-format off
const int nSeeds = refSize > options.minSeeds * options.seedSpacing ? refSize / options.seedSpacing : options.minSeeds;
const int margin = details::round(refSize / (nSeeds * 3.0f));
const int bandWidth = details::round((refSize + querySize) * 0.5f) - 3;
// clang-format on

const int nSeeds = refSize > 1000 ? details::round(refSize / 100.0) : 10;
const int margin = refSize > 10000 ? 30 : details::round(refSize / 300.0);
const int bandWidth = details::round((refSize + querySize) * 0.5) - 3;
int start_pos = 0;
if (bandWidth < 2 * seedLength) {
return {.meanShift = details::round((refSize - querySize) * 0.5), .bandWidth = bandWidth};
if (bandWidth < 2 * options.seedLength) {
return {
.meanShift = details::round((refSize - querySize) * 0.5f),// NOLINT: cppcoreguidelines-avoid-magic-numbers
.bandWidth = bandWidth //
};
}

const auto mapToGoodPositions = getMapToGoodPositions(query, seedLength);
const auto mapToGoodPositions = getMapToGoodPositions(query, options.seedLength);
const int nGoodPositions = mapToGoodPositions.size();

// TODO: Maybe use something other than array? A struct with named fields to make
Expand All @@ -153,10 +150,10 @@ SeedAlignment seedAlignment(const Sequence<Letter>& query, const Sequence<Letter

const int qPos = mapToGoodPositions[details::round(margin + (kmerSpacing * ni))];
// FIXME: query.substr() creates a new string. Use string view instead.
const auto tmpMatch = seedMatch(query.substr(qPos, seedLength), ref, start_pos, allowed_mismatches);
const auto tmpMatch = seedMatch(query.substr(qPos, options.seedLength), ref, start_pos, options.mismatchesAllowed);

// only use seeds with at most allowed_mismatches
if (tmpMatch.score >= seedLength - allowed_mismatches) {
if (tmpMatch.score >= options.seedLength - options.mismatchesAllowed) {
seedMatches.push_back({qPos, tmpMatch.shift, tmpMatch.shift - qPos, tmpMatch.score});
start_pos = tmpMatch.shift;
}
Expand Down Expand Up @@ -188,7 +185,8 @@ SeedAlignment seedAlignment(const Sequence<Letter>& query, const Sequence<Letter

template<typename Letter>
ForwardTrace scoreMatrix(const Sequence<Letter>& query, const Sequence<Letter>& ref,
const std::vector<int>& gapOpenClose, int bandWidth, int meanShift) {
const std::vector<int>& gapOpenClose, int bandWidth, int meanShift,
const NextalignAlignmentOptions& alignmentOptions) {
// allocate a matrix to record the matches
const int querySize = safe_cast<int>(query.size());
const int refSize = safe_cast<int>(ref.size());
Expand All @@ -213,20 +211,16 @@ ForwardTrace scoreMatrix(const Sequence<Letter>& query, const Sequence<Letter>&
// -> vertical step in the matrix from si+1 to si
// 2) if X is a base and Y is '-', rPos advances the same and the shift increases
// -> diagonal step in the matrix from (ri,si-1) to (ri+1,si)
const int gapExtend = alignmentParameters.gapExtend;
const int gapOpen = alignmentParameters.gapOpen;
const int misMatch = alignmentParameters.misMatch;
const int match = alignmentParameters.match;
const int NO_ALIGN = -(match - misMatch) * refSize;
const auto NO_ALIGN = -(alignmentOptions.scoreMatch - alignmentOptions.scoreMismatch) * refSize;

for (int si = 2 * bandWidth; si > bandWidth; si--) {
paths(si, 0) = qryGAPmatrix;
}
paths(bandWidth, 0) = MATCH;
qryGaps[bandWidth] = gapOpen;
qryGaps[bandWidth] = alignmentOptions.scoreGapOpen;
for (int si = bandWidth - 1; si >= 0; si--) {
paths(si, 0) = refGAPmatrix;
qryGaps[si] = gapOpen;
qryGaps[si] = alignmentOptions.scoreGapOpen;
}
for (int ri = 0; ri < refSize; ri++) {
int qPos = ri - (bandWidth + meanShift);
Expand All @@ -246,13 +240,14 @@ ForwardTrace scoreMatrix(const Sequence<Letter>& query, const Sequence<Letter>&
// if the shifted position is within the query sequence

// no gap -- match case
tmpMatch = lookupMatchScore(query[qPos], ref[ri]) > 0 ? match : misMatch;
tmpMatch =
lookupMatchScore(query[qPos], ref[ri]) > 0 ? alignmentOptions.scoreMatch : alignmentOptions.scoreMismatch;
score = scores(si, ri) + tmpMatch;
origin = MATCH;

// check the scores of a reference gap
if (si < 2 * bandWidth) {
rGapExtend = refGaps + gapExtend;
rGapExtend = refGaps + alignmentOptions.scoreGapExtend;
rGapOpen = scores(si + 1, ri + 1) + gapOpenClose[ri + 1];
if (rGapExtend > rGapOpen) {
tmpScore = rGapExtend;
Expand All @@ -271,7 +266,7 @@ ForwardTrace scoreMatrix(const Sequence<Letter>& query, const Sequence<Letter>&

// check the scores of a reference gap
if (si > 0) {
qGapExtend = qryGaps[si - 1] + gapExtend;
qGapExtend = qryGaps[si - 1] + alignmentOptions.scoreGapExtend;
qGapOpen = scores(si - 1, ri) + gapOpenClose[ri];
tmpScore = qGapExtend > qGapOpen ? qGapExtend : qGapOpen;
if (qGapExtend > qGapOpen) {
Expand Down Expand Up @@ -426,6 +421,8 @@ AlignmentResult<Letter> backTrace(const Sequence<Letter>& query, const Sequence<
}
}

// TODO: shrink to fit

std::reverse(aln_query.begin(), aln_query.end());
std::reverse(aln_ref.begin(), aln_ref.end());

Expand All @@ -441,20 +438,21 @@ struct AlignPairwiseTag {};

template<typename Letter>
AlignmentResult<Letter> alignPairwise(const Sequence<Letter>& query, const Sequence<Letter>& ref,
const std::vector<int>& gapOpenClose, int minimalLength, AlignPairwiseTag) {
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions, AlignPairwiseTag) {
const int querySize = query.size();
if (querySize < minimalLength) {
if (querySize < alignmentOptions.minimalLength) {
throw ErrorAlignmentSequenceTooShort();
}

const SeedAlignment& seedAlignmentResult = seedAlignment(query, ref);
const SeedAlignment& seedAlignmentResult = seedAlignment(query, ref, seedOptions);
const auto& bandWidth = seedAlignmentResult.bandWidth;
const auto& meanShift = seedAlignmentResult.meanShift;

if (bandWidth > 400) {
if (bandWidth > alignmentOptions.maxIndel) {
throw ErrorAlignmentBadSeedMatches();
}
const ForwardTrace& forwardTrace = scoreMatrix(query, ref, gapOpenClose, bandWidth, meanShift);
const ForwardTrace& forwardTrace = scoreMatrix(query, ref, gapOpenClose, bandWidth, meanShift, alignmentOptions);
const auto& scores = forwardTrace.scores;
const auto& paths = forwardTrace.paths;

Expand All @@ -463,11 +461,13 @@ AlignmentResult<Letter> alignPairwise(const Sequence<Letter>& query, const Seque


NucleotideAlignmentResult alignPairwise(const NucleotideSequence& query, const NucleotideSequence& ref,
const std::vector<int>& gapOpenClose, int minimalLength) {
return alignPairwise(query, ref, gapOpenClose, minimalLength, AlignPairwiseTag{});
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions) {
return alignPairwise(query, ref, gapOpenClose, alignmentOptions, seedOptions, AlignPairwiseTag{});
}

AminoacidAlignmentResult alignPairwise(const AminoacidSequence& query, const AminoacidSequence& ref,
const std::vector<int>& gapOpenClose, int minimalLength) {
return alignPairwise(query, ref, gapOpenClose, minimalLength, AlignPairwiseTag{});
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions) {
return alignPairwise(query, ref, gapOpenClose, alignmentOptions, seedOptions, AlignPairwiseTag{});
}
26 changes: 11 additions & 15 deletions packages/nextalign/src/align/alignPairwise.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,23 @@ struct ForwardTrace {
vector2d<int> paths;
};


struct AlignmentParameters {
int gapExtend;
int gapOpen;
int misMatch;
int match;
};


template<typename Letter>
SeedAlignment seedAlignment(const std::string& query, const std::string& ref);
SeedAlignment seedAlignment(
const Sequence<Letter>& query, const Sequence<Letter>& ref, const NextalignSeedOptions& options);

template<typename Letter>
ForwardTrace scoreMatrix(const std::string& query, const std::string& ref, int bandWidth, int meanShift);
ForwardTrace scoreMatrix(const Sequence<Letter>& query, const Sequence<Letter>& ref,
const std::vector<int>& gapOpenClose, int bandWidth, int meanShift,
const NextalignAlignmentOptions& alignmentOptions);

template<typename Letter>
AlignmentResult<Letter> backTrace(const std::string& query, const std::string& ref, const vector2d<int>& scores,
const vector2d<int>& paths, int meanShift);
AlignmentResult<Letter> backTrace(const Sequence<Letter>& query, const Sequence<Letter>& ref,
const vector2d<int>& scores, const vector2d<int>& paths, int meanShift);

NucleotideAlignmentResult alignPairwise(const NucleotideSequence& query, const NucleotideSequence& ref,
const std::vector<int>& gapOpenClose, int minimalLength);
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions);

AminoacidAlignmentResult alignPairwise(const AminoacidSequence& query, const AminoacidSequence& ref,
const std::vector<int>& gapOpenClose, int minimalLength);
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions);
6 changes: 3 additions & 3 deletions packages/nextalign/src/align/getGapOpenCloseScores.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ std::vector<int> getGapOpenCloseScoresFlat(//
/* in */ const NucleotideSequence& ref, //
/* in */ const NextalignOptions& options //
) {
std::vector<int> gapOpenClose(ref.size()+2);
std::fill(gapOpenClose.begin(), gapOpenClose.end(), options.gapOpenOutOfFrame);
std::vector<int> gapOpenClose(ref.size() + 2);
std::fill(gapOpenClose.begin(), gapOpenClose.end(), options.alignment.scoreGapOpen);
return gapOpenClose;
}

Expand All @@ -41,7 +41,7 @@ std::vector<int> getGapOpenCloseScoresCodonAware(//

// TODO: might use std::fill()
for (int i = gene.start; i <= gene.end; i += 3) {
gapOpenClose[i] = options.gapOpenInFrame;
gapOpenClose[i] = options.alignment.scoreGapOpenInFrame;
}
}

Expand Down
2 changes: 1 addition & 1 deletion packages/nextalign/src/nextalign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ NextalignResult nextalign(const NucleotideSequence& query, const NucleotideSeque
const auto gapOpenCloseNuc = getGapOpenCloseScoresCodonAware(ref, geneMap, options);
const auto gapOpenCloseAA = getGapOpenCloseScoresFlat(ref, options);

const auto alignment = alignPairwise(query, ref, gapOpenCloseNuc, 100);
const auto alignment = alignPairwise(query, ref, gapOpenCloseNuc, options.alignment, options.seedNuc);

std::vector<Peptide> queryPeptides;
std::vector<Peptide> refPeptides;
Expand Down
31 changes: 31 additions & 0 deletions packages/nextalign/src/options.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#pragma once

#include <nextalign/nextalign.h>

inline const NextalignOptions OPTIONS_DEFAULT = {
.alignment =
{
.minimalLength = 100,
.scoreGapExtend = 0,
.scoreGapOpen = -6,
.scoreGapOpenInFrame = -5,
.scoreMismatch = -1,
.scoreMatch = 3,
.maxIndel = 400,
},
.seedNuc =
{
.seedLength = 21,
.minSeeds = 10,
.seedSpacing = 100,
.mismatchesAllowed = 3,
},
.seedAa =
{
.seedLength = 21,
.minSeeds = 10,
.seedSpacing = 100,
.mismatchesAllowed = 3,
},
.genes = {},
};
2 changes: 1 addition & 1 deletion packages/nextalign/src/translate/translateGenes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ PeptidesInternal translateGenes( //
const auto& queryGene = extractGeneQuery(query, gene, coordMap);
const auto queryPeptide = translate(queryGene);

const auto geneAlignment = alignPairwise(queryPeptide, refPeptide, gapOpenCloseAA, 10);
const auto geneAlignment = alignPairwise(queryPeptide, refPeptide, gapOpenCloseAA, options.alignment, options.seedAa);
const auto stripped = stripInsertions(geneAlignment.ref, geneAlignment.query);


Expand Down
Loading

0 comments on commit 5e3900a

Please sign in to comment.