Skip to content

Commit

Permalink
Fix/finish exact kmer lookups; add basic analysis of kmers in reads.
Browse files Browse the repository at this point in the history
  • Loading branch information
brianwalenz committed Jul 23, 2018
1 parent fca1102 commit 33df82d
Show file tree
Hide file tree
Showing 6 changed files with 295 additions and 27 deletions.
1 change: 1 addition & 0 deletions src/main.mk
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ SUBMAKEFILES := stores/dumpBlob.mk \
meryl-san/simple-dump.mk \
\
meryl/meryl.mk \
meryl/findSeedThreshold.mk \
\
sequence/sequence.mk \
\
Expand Down
157 changes: 157 additions & 0 deletions src/meryl/findSeedThreshold.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@

/******************************************************************************
*
* This file is part of canu, a software program that assembles whole-genome
* sequencing reads into contigs.
*
* This software is based on:
* 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
* the 'kmer package' (http://kmer.sourceforge.net)
* both originally distributed by Applera Corporation under the GNU General
* Public License, version 2.
*
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
* Modifications by:
*
* Brian P. Walenz beginning on 2018-JUL-23
* are a 'United States Government Work', and
* are released in the public domain
*
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/

#include "sqStore.H"

#include "files.H"
#include "strings.H"
#include "kmers.H"

int
main(int argc, char **argv) {
char *seqStorePath = NULL;
char *merylPath = NULL;

uint32 bgnID = 1;
uint32 endID = UINT32_MAX;

argc = AS_configure(argc, argv);

vector<char *> err;
int arg = 1;
while (arg < argc) {
if (strcmp(argv[arg], "-S") == 0) {
seqStorePath = argv[++arg];
}

else if (strcmp(argv[arg], "-M") == 0) {
merylPath = argv[++arg];
}

else if (strcmp(argv[arg], "-r") == 0) {
decodeRange(argv[++arg], bgnID, endID);
}

else {
}

arg++;
}

if (seqStorePath == NULL) err.push_back("No sequence store (-S option) supplied.\n");
if (merylPath == NULL) err.push_back("No kmer data (-M option) supplied.\n");

if (err.size() > 0) {
fprintf(stderr, "usage: %s -S seqPath -M merylData ...\n", argv[0]);
fprintf(stderr, "\n");
exit(1);
}


sqStore *seqStore = sqStore::sqStore_open(seqStorePath);

bgnID = max(bgnID, (uint32)1);
endID = min(endID, seqStore->sqStore_getNumReads() + 1);

sqReadData *readData = new sqReadData;

kmerCountFileReader *merylReader = new kmerCountFileReader(merylPath, false, true);
kmerCountExactLookup *merylLookup = new kmerCountExactLookup(merylReader);

fprintf(stdout, " ----kmers---- -------value----------\n");
fprintf(stdout, " ID found tested min max average\n");
fprintf(stdout, "----- ------ ------ ------ ------ --------\n");


for (uint32 ii=bgnID; ii<endID; ii++) {
sqRead *read = seqStore->sqStore_getRead(ii);

seqStore->sqStore_loadReadData(read, readData);

char *seq = readData->sqReadData_getSequence();
uint32 seqLen = read->sqRead_sequenceLength();

uint32 kmersTested = 0;
uint32 kmersFound = 0;

kmer fmer;
kmer rmer;

uint32 kmerLoad = 0;
uint32 kmerValid = fmer.merSize() - 1;

uint64 average = 0;
uint32 minV = UINT32_MAX;
uint32 maxV = 0;

for (uint32 ss=0; ss<seqLen; ss++) {
if ((seq[ss] != 'A') && (seq[ss] != 'a') && // If not valid DNA, don't
(seq[ss] != 'C') && (seq[ss] != 'c') && // make a kmer, and reset
(seq[ss] != 'G') && (seq[ss] != 'g') && // the count until the next
(seq[ss] != 'T') && (seq[ss] != 't')) { // valid kmer is available.
kmerLoad = 0;
continue;
}

fmer.addR(seq[ss]);
rmer.addL(seq[ss]);

if (kmerLoad < kmerValid) { // If not a full kmer, increase the length we've
kmerLoad++; // got loaded, and keep going.
continue;
}

uint32 value;

if (fmer < rmer)
value = merylLookup->value(fmer);
else
value = merylLookup->value(rmer);

kmersTested++;

if (value) {
minV = min(minV, value);
maxV = max(maxV, value);

kmersFound++;
}

average += value;
}

if (kmersTested > 0)
fprintf(stdout, "%5u %6u %6u %6u %6u %8.2f\n",
ii, kmersFound, kmersTested, minV, maxV, (double)average / kmersTested);
}


delete merylLookup;
delete merylReader;

seqStore->sqStore_close();

exit(0);
}
19 changes: 19 additions & 0 deletions src/meryl/findSeedThreshold.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# If 'make' isn't run from the root directory, we need to set these to
# point to the upper level build directory.
ifeq "$(strip ${BUILD_DIR})" ""
BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj
endif
ifeq "$(strip ${TARGET_DIR})" ""
TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE)
endif

TARGET := findSeedThreshold
SOURCES := findSeedThreshold.C

SRC_INCDIRS := . .. ../utility ../stores

TGT_LDFLAGS := -L${TARGET_DIR}/lib
TGT_LDLIBS := -lcanu
TGT_PREREQS := libcanu.a

SUBMAKEFILES :=
2 changes: 1 addition & 1 deletion src/meryl/lookup.C
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ main(int argc, char **argv) {

tested++;

if (ll->exists(k))
if (ll->value(k))
found++;

if ((tested % 100000) == 0)
Expand Down
59 changes: 45 additions & 14 deletions src/utility/kmers-exact.C
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ bitsToGB(uint64 bits) {
// 2^p * log2(nDistinct) + (K - p) * nDistinct
//

kmerCountExactLookup::kmerCountExactLookup(kmerCountFileReader *input) {
kmerCountExactLookup::kmerCountExactLookup(kmerCountFileReader *input,
uint32 minValue,
uint32 maxValue) {

_Kbits = kmer::merSize() * 2;

Expand All @@ -48,11 +50,32 @@ kmerCountExactLookup::kmerCountExactLookup(kmerCountFileReader *input) {
_suffixBits = 0; // Width of an entry in the suffix table.
_valueBits = 0; // (also in the suffix table)

if (minValue == 0) // Silently adjust to the legal minValue.
minValue = 1;

_valueOffset = minValue - 1; // "1" stored in the data is really "minValue" to the user.

_nPrefix = 0; // Number of entries in pointer table.
_nSuffix = input->stats()->numDistinct(); // Number of entries in suffix dable.

_prePtrBits = logBaseTwo64(_nSuffix); // Width of an entry in the prefix table.

// If maxValue isn't set, ask the input what the largest count is.
// Then set the valueBits needed to hold those values.

if (maxValue == UINT32_MAX) {
kmerCountStatistics *hist = input->stats();

maxValue = hist->numFrequencies() - 1;

while ((maxValue > 0) && (hist->numKmersAtFrequency(maxValue) == 0))
maxValue--;

fprintf(stderr, "MAX VALUE %u\n", maxValue);
}

if (maxValue > minValue)
_valueBits = logBaseTwo32(maxValue + 1 - minValue);

// First, find the prefixBits that results in the smallest allocated memory size.

Expand Down Expand Up @@ -202,16 +225,6 @@ kmerCountExactLookup::kmerCountExactLookup(kmerCountFileReader *input) {
uint64 sdata = 0;
uint64 prefix = 0;

#if 0
assert(bb == block->prefix());

sdata = ff;
sdata <<= input->numBlocksBits();
sdata |= bb;
sdata <<= input->suffixSize();
sdata |= block->suffixes()[ss];
#endif

sdata = block->prefix();
sdata <<= input->suffixSize();
sdata |= block->suffixes()[ss];
Expand All @@ -220,11 +233,29 @@ kmerCountExactLookup::kmerCountExactLookup(kmerCountFileReader *input) {

prefix = sdata >> _suffixBits;

// Add in any extra data to be stored here.
// Add in any extra data to be stored here. Unfortunately,
// we must load the values outside the minValue and maxValue range, otherwise
// we end up with holes in the table. This also means we need to store kmers
// with value 0, which are treated as if they don't exist later.

if (_valueBits > 0) {
uint32 value = block->counts()[ss];

if ((value < minValue) ||
(maxValue < value))
value = 0;

if (value != 0)
value -= _valueOffset;

if (value > maxValue + 1 - minValue) {
fprintf(stderr, "minValue %u maxValue %u value %u bits %u\n",
minValue, maxValue, value, _valueBits);
}
assert(value <= uint64MASK(_valueBits));

sdata <<= _valueBits;
sdata |= 0;
sdata |= value;
}

// And store it! We let set() mask off the top end of the kmer that we shouldn't
Expand Down Expand Up @@ -278,7 +309,7 @@ kmerCountExactLookup::kmerCountExactLookup(kmerCountFileReader *input) {
_suffixStart[0] = 0;
_suffixStart[_nPrefix] = _nSuffix;

#if 1
#if 0
FILE *F = fopen("suffix-start-table", "w");
for (uint64 pp=0; pp<_nPrefix; pp++)
fprintf(F, "%8lu %lu\n", pp, _suffixStart[pp]);
Expand Down
Loading

0 comments on commit 33df82d

Please sign in to comment.