Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: skip seed match for aa and infer alignment params from nuc alignment #613

Merged
merged 12 commits into from Nov 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .clang-tidy
@@ -1,5 +1,5 @@
---
Checks: '*,-OCUnusedMacroInspection,-readability-identifier-length,-bugprone-easily-swappable-parameters,-altera-struct-pack-align,-altera-unroll-loops,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-type-vararg,-fuchsia*,-google-readability-todo,-google-runtime-references,-hicpp*,-llvm-header-guard,-llvmlibc*,-modernize-avoid-c-arrays,-modernize-use-nodiscard,-modernize-use-trailing-return-type,-readability-convert-member-functions-to-static,-readability-function-cognitive-complexity,-readability-implicit-bool-conversion,-readability-isolate-declaration,-readability-magic-numbers,-readability-named-parameter,-readability-uppercase-literal-suffix,-readability-identifier-length,-bugprone-easily-swappable-parameters,-altera-id-dependent-backward-branch'
Checks: '*,-OCUnusedMacroInspection,-readability-identifier-length,-bugprone-easily-swappable-parameters,-altera-struct-pack-align,-altera-unroll-loops,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-type-vararg,-fuchsia*,-google-readability-todo,-google-runtime-references,-hicpp*,-llvm-header-guard,-llvmlibc*,-modernize-avoid-c-arrays,-modernize-use-nodiscard,-modernize-use-trailing-return-type,-readability-convert-member-functions-to-static,-readability-function-cognitive-complexity,-readability-implicit-bool-conversion,-readability-isolate-declaration,-readability-magic-numbers,-readability-named-parameter,-readability-uppercase-literal-suffix,-readability-identifier-length,-bugprone-easily-swappable-parameters,-altera-id-dependent-backward-branch,-readability-avoid-const-params-in-decls'
HeaderFilterRegex: '^(?!emscripten\/|boost\/|system\/|3rdparty\/).*$'
AnalyzeTemporaryDtors: true
...
2 changes: 2 additions & 0 deletions packages/nextalign/CMakeLists.txt
Expand Up @@ -21,6 +21,8 @@ add_library(${PROJECT_NAME} STATIC
include/nextalign/private/nextalign_private.h
src/align/alignPairwise.cpp
src/align/alignPairwise.h
src/align/alignmentParams.cpp
src/align/alignmentParams.h
src/align/getGapOpenCloseScores.cpp
src/align/getGapOpenCloseScores.h
src/alphabet/aminoacids.cpp
Expand Down
38 changes: 20 additions & 18 deletions packages/nextalign/src/align/alignPairwise.cpp
Expand Up @@ -478,7 +478,7 @@ struct AlignPairwiseTag {};
template<typename Letter>
AlignmentStatus<Letter> alignPairwise(const Sequence<Letter>& query, const Sequence<Letter>& ref,
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions, AlignPairwiseTag) {
const NextalignSeedOptions& seedOptions, int bandWidth, int shift, AlignPairwiseTag) {

debug_trace(
"Align pairwise: started:\n minimalLength={:},\n penaltyGapExtend={:},\n penaltyGapOpen={:},\n "
Expand All @@ -489,18 +489,6 @@ AlignmentStatus<Letter> alignPairwise(const Sequence<Letter>& query, const Seque
alignmentOptions.scoreMatch, alignmentOptions.maxIndel, seedOptions.seedLength, seedOptions.minSeeds,
seedOptions.seedSpacing, seedOptions.mismatchesAllowed);

const SeedAlignmentStatus& seedAlignmentStatus = seedAlignment(query, ref, seedOptions);
if (seedAlignmentStatus.status != Status::Success) {
return AlignmentStatus<Letter>{
.status = seedAlignmentStatus.status,
.error = seedAlignmentStatus.error,
.result = {},
};
}

const auto& bandWidth = seedAlignmentStatus.result->bandWidth;
const auto& meanShift = seedAlignmentStatus.result->meanShift;
debug_trace("Align pairwise: after seed alignment: bandWidth={:}, meanShift={:}\n", bandWidth, meanShift);

if (bandWidth > alignmentOptions.maxIndel) {
debug_trace("Align pairwise: failed: `bandWidth > alignmentOptions.maxIndel`, where bandWidth={:}, maxIndel={:}\n",
Expand All @@ -511,12 +499,12 @@ AlignmentStatus<Letter> alignPairwise(const Sequence<Letter>& query, const Seque
.result = {},
};
}
const ForwardTrace& forwardTrace = scoreMatrix(query, ref, gapOpenClose, bandWidth, meanShift, alignmentOptions);
const ForwardTrace& forwardTrace = scoreMatrix(query, ref, gapOpenClose, bandWidth, shift, alignmentOptions);
const auto& scores = forwardTrace.scores;
const auto& paths = forwardTrace.paths;
// debug_trace("Align pairwise: after score matrix: scores={:}, paths={:}\n", scores, paths);

return backTrace<Letter>(query, ref, scores, paths, meanShift);
return backTrace<Letter>(query, ref, scores, paths, shift);
}


Expand All @@ -533,11 +521,25 @@ NucleotideAlignmentStatus alignPairwise(const NucleotideSequence& query, const N
};
}

return alignPairwise(query, ref, gapOpenClose, alignmentOptions, seedOptions, AlignPairwiseTag{});
const SeedAlignmentStatus& seedAlignmentStatus = seedAlignment(query, ref, seedOptions);
if (seedAlignmentStatus.status != Status::Success) {
return AlignmentStatus<Nucleotide>{
.status = seedAlignmentStatus.status,
.error = seedAlignmentStatus.error,
.result = {},
};
}

const auto& bandWidth = seedAlignmentStatus.result->bandWidth;
const auto& meanShift = seedAlignmentStatus.result->meanShift;
debug_trace("Align pairwise: after seed alignment: bandWidth={:}, meanShift={:}\n", bandWidth, meanShift);

return alignPairwise(query, ref, gapOpenClose, alignmentOptions, seedOptions, bandWidth, meanShift,
AlignPairwiseTag{});
}

AminoacidAlignmentStatus alignPairwise(const AminoacidSequence& query, const AminoacidSequence& ref,
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions) {
return alignPairwise(query, ref, gapOpenClose, alignmentOptions, seedOptions, AlignPairwiseTag{});
const NextalignSeedOptions& seedOptions, int bandWidth, int shift) {
return alignPairwise(query, ref, gapOpenClose, alignmentOptions, seedOptions, bandWidth, shift, AlignPairwiseTag{});
}
4 changes: 2 additions & 2 deletions packages/nextalign/src/align/alignPairwise.h
Expand Up @@ -55,7 +55,7 @@ SeedAlignmentStatus seedAlignment(const Sequence<Letter>& query, const Sequence<

template<typename Letter>
ForwardTrace scoreMatrix(const Sequence<Letter>& query, const Sequence<Letter>& ref,
const std::vector<int>& gapOpenClose, int bandWidth, int meanShift,
const std::vector<int>& gapOpenClose, const int bandWidth, const int meanShift,
const NextalignAlignmentOptions& alignmentOptions);

template<typename Letter>
Expand All @@ -68,4 +68,4 @@ NucleotideAlignmentStatus alignPairwise(const NucleotideSequence& query, const N

AminoacidAlignmentStatus alignPairwise(const AminoacidSequence& query, const AminoacidSequence& ref,
const std::vector<int>& gapOpenClose, const NextalignAlignmentOptions& alignmentOptions,
const NextalignSeedOptions& seedOptions);
const NextalignSeedOptions& seedOptions, const int bandWidth, const int shift);
77 changes: 77 additions & 0 deletions packages/nextalign/src/align/alignmentParams.cpp
@@ -0,0 +1,77 @@
#include "alignmentParams.h"

#include <nextalign/nextalign.h>

#include "../utils/debug_trace.h"
#include "../utils/safe_cast.h"


GapCounts countGaps(const NucleotideSequence& seq) {
int len = safe_cast<int>(seq.size());

if (len == 0) {
return GapCounts{
.leading = 0,
.internal = 0,
.trailing = 0,
.total = 0,
};
}

if (len == 1) {
return GapCounts{
.leading = isGap(seq[0]) ? 1 : 0,
.internal = 0,
.trailing = 0,
.total = 0,
};
}

// Rewind forward until the first non-gap
int begin = 0;
while (begin < len && isGap(seq[begin])) {
++begin;
}


// Rewind backwards starting from the end, until the first non-gap
int end = len - 1;
while (end > begin && isGap(seq[end])) {
--end;
}

// Count gaps in the internal region
int totalInternalGaps = 0;
for (int i = begin; i < end; ++i) {
if (isGap(seq[i])) {
++totalInternalGaps;
}
}

int leading = begin;
int internal = totalInternalGaps;
int trailing = len - end - 1;
int total = leading + internal + trailing;

return GapCounts{
.leading = leading,
.internal = internal,
.trailing = trailing,
.total = total,
};
}

AlignmentParams calculateAaAlignmentParams(const GapCounts& queryGapCounts, const GapCounts& refGapCounts) {
constexpr int BASE_BAND_WIDTH = 3;// An arbitrary magic number to give some additional room for alignment

const int bandWidth = std::max(queryGapCounts.internal, refGapCounts.internal) / 3 + BASE_BAND_WIDTH;
const int shift = queryGapCounts.leading / 3 + bandWidth / 2;

debug_trace("Deduced alignment params: bandWidth={:}, shift={:}\n", bandWidth, shift);

return AlignmentParams{
.bandWidth = bandWidth,
.shift = shift,
};
;
}
23 changes: 23 additions & 0 deletions packages/nextalign/src/align/alignmentParams.h
@@ -0,0 +1,23 @@
#pragma once

#include <nextalign/nextalign.h>


struct GapCounts {
int leading;
int internal;
int trailing;
int total;
};


struct AlignmentParams {
int bandWidth;
int shift;
};

/** Returns number of leading, internal and trailing gaps, as well as total count */
GapCounts countGaps(const NucleotideSequence& seq);

/** Returns aminoacid alignment parameters deduced from nucleotide alignment */
AlignmentParams calculateAaAlignmentParams(const GapCounts& queryGapCounts, const GapCounts& refGapCounts);
8 changes: 5 additions & 3 deletions packages/nextalign/src/nextalign.cpp
@@ -1,6 +1,8 @@
#include <nextalign/nextalign.h>
#include <utils/concat_move.h>

#include <algorithm>

#include "align/alignPairwise.h"
#include "align/getGapOpenCloseScores.h"
#include "strip/stripInsertions.h"
Expand Down Expand Up @@ -47,6 +49,9 @@ NextalignResultInternal nextalignInternal(const NucleotideSequence& query, const
throw ErrorNonFatal(*alignmentStatus.error);
}

const auto stripped = stripInsertions(alignmentStatus.result->ref, alignmentStatus.result->query);
const auto refStripped = removeGaps(ref);

std::vector<PeptideInternal> queryPeptides;
Warnings warnings;
if (!geneMap.empty()) {
Expand All @@ -57,9 +62,6 @@ NextalignResultInternal nextalignInternal(const NucleotideSequence& query, const
concat_move(peptidesInternal.warnings.inGenes, warnings.inGenes);
}

const auto stripped = stripInsertions(alignmentStatus.result->ref, alignmentStatus.result->query);
const auto refStripped = removeGaps(ref);

return NextalignResultInternal{
.query = stripped.queryStripped,
.ref = refStripped,
Expand Down
29 changes: 27 additions & 2 deletions packages/nextalign/src/translate/translateGenes.cpp
Expand Up @@ -17,9 +17,16 @@
#include "./mapCoordinates.h"
#include "./translate.h"
#include "align/alignPairwise.h"
#include "align/alignmentParams.h"
#include "detectFrameShifts.h"
#include "removeGaps.h"

namespace {
template<typename T>
inline T copy(const T& t) {
return T(t);
}
}// namespace

void maskNucFrameShiftsInPlace(NucleotideSequence& seq,
const std::vector<InternalFrameShiftResultWithMask>& frameShifts) {
Expand Down Expand Up @@ -116,7 +123,23 @@ PeptidesInternal translateGenes( //
}

auto& refGeneSeq = *extractRefGeneStatus.result;
const auto refGapCounts = countGaps(refGeneSeq);

auto& queryGeneSeq = *extractQueryGeneStatus.result;
const auto queryGeneSize = safe_cast<int>(queryGeneSeq.size());

const auto queryGapCounts = countGaps(queryGeneSeq);
const bool queryIsAllGaps = queryGapCounts.total >= queryGeneSize;

// Handle the case when the query gene is completely missing
if (queryGeneSeq.empty() || queryIsAllGaps) {
const auto message = fmt::format(
"When processing gene \"{:s}\": The gene consists entirely from gaps. "
"Note that this gene will not be included in the results of the sequence.",
geneName);
warnings.inGenes.push_back(GeneWarning{.geneName = geneName, .message = message});
continue;
}

// Make sure subsequent gap stripping does not introduce frame shift
protectFirstCodonInPlace(refGeneSeq);
Expand All @@ -134,9 +157,11 @@ PeptidesInternal translateGenes( //
debug_trace("Translating gene '{:}'\n", geneName);
const auto queryPeptide = translate(queryGeneSeq, options.translatePastStop);


debug_trace("Aligning peptide '{:}'\n", geneName);
const auto geneAlignmentStatus =
alignPairwise(queryPeptide, refPeptide->peptide, gapOpenCloseAA, options.alignment, options.seedAa);
const AlignmentParams alignmentParams = calculateAaAlignmentParams(queryGapCounts, refGapCounts);
const auto geneAlignmentStatus = alignPairwise(queryPeptide, refPeptide->peptide, gapOpenCloseAA, options.alignment,
options.seedAa, alignmentParams.bandWidth, alignmentParams.shift);

if (geneAlignmentStatus.status != Status::Success) {
const auto message = fmt::format(
Expand Down
1 change: 1 addition & 0 deletions packages/nextalign/tests/CMakeLists.txt
Expand Up @@ -12,6 +12,7 @@ add_executable(${PROJECT_NAME}
_musl_workarounds.c
alignPairwise.test.cpp
alignPairwiseWithCodons.test.cpp
alignmentParams.test.cpp
data/sampleGeneMap.h
decode.test.cpp
extractGene.test.cpp
Expand Down