-
Notifications
You must be signed in to change notification settings - Fork 58
/
extractGene.cpp
101 lines (81 loc) · 3.3 KB
/
extractGene.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include "extractGene.h"
#include <frozen/string.h>
#include <nextalign/nextalign.h>
#include <vector>
#include "../align/alignPairwise.h"
#include "../utils/safe_cast.h"
#include "./removeGaps.h"
NucleotideSequenceView extractGeneRef(const NucleotideSequenceView& ref, const Gene& gene) {
precondition_less(gene.length, ref.size());
precondition_less_equal(gene.length, ref.size());
return ref.substr(gene.start, gene.length);
}
/**
* Replaces first or second gap in a not-all-gap codon with N
*/
template<typename SpanIterator>
void protectCodonInPlace(SpanIterator it) {
if (it[0] == Nucleotide::GAP) {
it[0] = Nucleotide::N;
if (it[1] == Nucleotide::GAP) {
it[1] = Nucleotide::N;
precondition_not_equal(it[2], Nucleotide::GAP);// Should last position in codon should not be a gap
}
}
}
void stripGeneInPlace(NucleotideSequence& seq) {
precondition_divisible_by(seq.size(), 3);
const auto& length = safe_cast<int>(seq.size());
NucleotideSequenceSpan seqSpan = seq;
// Find the first non-GAP nucleotide and replace GAPs in the corresponding codon with Ns, so that it's not getting stripped
for (int i = 0; i < length; ++i) {
if (seqSpan[i] != Nucleotide::GAP) {
const auto codonBegin = i - i % 3;
invariant_greater_equal(codonBegin, 0);
invariant_less(codonBegin + 2, length);
const auto codon = seqSpan.subspan(codonBegin, 3);
protectCodonInPlace(codon.begin());
break;
}
}
// Find the last non-GAP nucleotide and replace GAPs in the corresponding codon with Ns, so that it's not getting stripped
for (int i = length - 1; i >= 0; --i) {
if (seqSpan[i] != Nucleotide::GAP) {
const auto codonBegin = i - i % 3;
invariant_greater_equal(codonBegin, 0);
invariant_less(codonBegin + 2, length);
const auto codon = seqSpan.subspan(codonBegin, 3);
protectCodonInPlace(codon.rbegin());// Note: reverse iterator - going from end to begin
break;
}
}
// Remove all GAP characters from everywhere (Note: including the full gap-only codons at the edges)
removeGapsInPlace(seq);
}
/**
* Extracts gene from the query sequence according to coordinate map relative to the reference sequence
*/
NucleotideSequence extractGeneQuery(
const NucleotideSequenceView& query, const Gene& gene, const std::vector<int>& coordMap) {
precondition_less(gene.start, coordMap.size());
precondition_less(gene.end, coordMap.size());
// Transform gene coordinates according to coordinate map
const auto start = coordMap[gene.start];
const auto end = coordMap[gene.end];// TODO: `gene.end` -1 or not?
const auto length = end - start;
// Start and end should be within bounds
invariant_less(start, query.size());
invariant_less(end, query.size());
auto result = NucleotideSequence(query.substr(start, length));
const auto resultLengthPreStrip = safe_cast<int>(result.size());
if (resultLengthPreStrip % 3 != 0) {
throw ErrorExtractGeneLengthInvalid(gene.geneName, resultLengthPreStrip);
}
stripGeneInPlace(result);
const auto resultLength = safe_cast<int>(result.size());
if (resultLength % 3 != 0) {
throw ErrorExtractGeneLengthInvalid(gene.geneName, resultLength);
}
invariant_less_equal(result.size(), query.size());// Length of the gene should not exceed the length of the sequence
return result;
}