Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Copied cpp code from other repo
- Loading branch information
Albert Kim
committed
Apr 5, 2012
0 parents
commit f65202b
Showing
26 changed files
with
2,640 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
PROG_NAME = mermaid | ||
|
||
CXX = mpic++ | ||
CXXFLAGS = -c -g -Wall -O3 -fshort-enums | ||
LDFLAGS = | ||
ifeq ($(shell uname), Darwin) | ||
LIBS = -lboost_mpi -lboost_serialization-mt -lboost_filesystem-mt -lboost_system-mt | ||
else | ||
LIBS = -lboost_mpi -lboost_serialization -lboost_filesystem -lboost_system | ||
endif | ||
DEPFLAGS = -MM | ||
|
||
SRCS := $(wildcard *.cpp) | ||
OBJS := $(SRCS:.cpp=.o) | ||
DEPS := $(OBJS:.o=.d) | ||
|
||
all : $(PROG_NAME) | ||
|
||
$(PROG_NAME) : $(OBJS) | ||
$(CXX) $(LDFLAGS) -o $@ $^ $(LIBS) | ||
|
||
%.d : %.cpp | ||
@$(CXX) $(DEPFLAGS) -o $@ $^ | ||
# @echo -e '\t$$(CXX) $$(CXXFLAGS) -o $$@ $$<' >> $@ | ||
|
||
-include $(DEPS) | ||
|
||
.PHONY : clean | ||
|
||
clean : | ||
rm -rf *.o *.d $(PROG_NAME) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#include "bam_reader.h" | ||
|
||
BAMReader::BAMReader(char* fname) | ||
: Reader(fname) | ||
{ | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#ifndef _BAM_READER_H_ | ||
#define _BAM_READER_H_ | ||
|
||
#include "reader.h" | ||
|
||
class BAMReader : public Reader { | ||
public: | ||
BAMReader(char* fname); | ||
}; | ||
|
||
#endif /* _BAM_READER_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#include "base.h" | ||
#include "utils.h" | ||
|
||
namespace BASE { | ||
base char2base(char c) | ||
{ | ||
switch (c) { | ||
case 'A': return A; | ||
case 'C': return C; | ||
case 'G': return G; | ||
case 'T': return T; | ||
case 'N': return N; | ||
default: panic("Invalid %s: %u\n", __func__, c); | ||
} | ||
} | ||
|
||
char base2char(base b) | ||
{ | ||
switch(b) { | ||
case A: return 'A'; | ||
case C: return 'C'; | ||
case G: return 'G'; | ||
case T: return 'T'; | ||
case N: return 'N'; | ||
default: panic("Invalid %s: %u\n", __func__, b); | ||
} | ||
} | ||
|
||
base inv_base(base b) | ||
{ | ||
switch (b) { | ||
case A: return T; | ||
case C: return G; | ||
case G: return C; | ||
case T: return A; | ||
case N: return N; | ||
default: panic("Invalid %s: %u\n", __func__, b); | ||
} | ||
} | ||
|
||
char inv_base(char c) | ||
{ | ||
switch (c) { | ||
case 'A': return 'T'; | ||
case 'C': return 'G'; | ||
case 'G': return 'C'; | ||
case 'T': return 'A'; | ||
case 'N': return 'N'; | ||
default: panic("Invalid %s: %u\n", __func__, c); | ||
} | ||
} | ||
|
||
bool validate_base(base b) | ||
{ | ||
switch (b) { | ||
case A: | ||
case C: | ||
case G: | ||
case T: | ||
case N: | ||
return true; | ||
default: | ||
return false; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#ifndef _BASE_H_ | ||
#define _BASE_H_ | ||
|
||
#include <boost/cstdint.hpp> | ||
|
||
#include "exceptions.h" | ||
|
||
namespace BASE { | ||
enum base { A = 0, C, G, T, N }; | ||
const uint8_t NUM_BASES = 4; | ||
|
||
base char2base(char c); | ||
char base2char(base b); | ||
base inv_base(base b); | ||
char inv_base(char c); | ||
bool validate_base(base b); | ||
} | ||
|
||
#endif /* _BASE_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#include <cstdlib> | ||
#include <climits> | ||
#include <cmath> | ||
|
||
#include <boost/cstdint.hpp> | ||
|
||
#include "bloom_filter.h" | ||
|
||
#define set_bit(a, i) ((a)[(i)/CHAR_BIT] |= (1 << ((i) % CHAR_BIT))) | ||
#define get_bit(a, i) ((a)[(i)/CHAR_BIT] & (1 << ((i) % CHAR_BIT))) | ||
|
||
BloomFilter::BloomFilter(size_t capacity, float error_rate, size_t seed, bloom_filter_hash_func_t hash) | ||
: capacity(capacity), count(0), seed(seed), hash(hash) | ||
{ | ||
nfuncs = (size_t) ceil(log2(1 / error_rate)); | ||
slice_size = (size_t) ceil(capacity * log(2)); | ||
/* FIXME - Using CHAR_BIT may tie us to using only gcc. */ | ||
array = (uint8_t*) calloc((nfuncs * slice_size + CHAR_BIT - 1) / CHAR_BIT, sizeof(uint8_t)); | ||
} | ||
|
||
BloomFilter::~BloomFilter() | ||
{ | ||
free(array); | ||
} | ||
|
||
void BloomFilter::add(void* key) | ||
{ | ||
for (size_t i = 0; i < nfuncs; i++) { | ||
set_bit(array, hash(i + seed, key) % slice_size + i * slice_size); | ||
} | ||
count++; | ||
} | ||
|
||
bool BloomFilter::check(void* key) | ||
{ | ||
for (size_t i = 0; i < nfuncs; i++) { | ||
if (!get_bit(array, hash(i + seed, key) % slice_size + i * slice_size)) | ||
return false; | ||
} | ||
return true; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#ifndef _BLOOM_FILTER_H_ | ||
#define _BLOOM_FILTER_H_ | ||
|
||
#include <boost/cstdint.hpp> | ||
|
||
typedef size_t (*bloom_filter_hash_func_t)(size_t seed, void* key); | ||
|
||
class BloomFilter { | ||
public: | ||
BloomFilter(size_t capacity, float error_rate, size_t seed, bloom_filter_hash_func_t hash); | ||
~BloomFilter(); | ||
|
||
/* Adds the key to the filter. */ | ||
void add(void* key); | ||
/* Returns true if key was found in the filter; false otherwise. */ | ||
bool check(void* key); | ||
|
||
bool full(void) { return count >= capacity; } | ||
|
||
protected: | ||
uint8_t* array; | ||
size_t capacity; | ||
size_t nfuncs; | ||
size_t slice_size; /* Size of each partition. */ | ||
size_t count; | ||
size_t seed; | ||
bloom_filter_hash_func_t hash; | ||
}; | ||
|
||
|
||
#endif /* _BLOOM_FILTER_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#ifndef _CONFIG_H_ | ||
#define _CONFIG_H_ | ||
|
||
typedef uint32_t k_t; | ||
|
||
const uint8_t Q_MIN = 19; /* Minimum quality threshold. */ | ||
const uint8_t D_MIN = 10; /* Minimum number of high-quality extensions needed. */ | ||
|
||
const k_t K = 41; | ||
|
||
const unsigned int PREFIX_ALLOC_LENGTH = 5; | ||
|
||
#endif /* _CONFIG_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#ifndef _EXCEPTIONS_H_ | ||
#define _EXCEPTIONS_H_ | ||
|
||
enum { | ||
INVALID_BASE, | ||
KMER_OUT_OF_BOUNDS | ||
}; | ||
|
||
#endif /* _EXCEPTIONS_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
#include <cstdlib> | ||
#include <vector> | ||
#include <string> | ||
#include <iostream> | ||
#include <cstring> | ||
|
||
#include <boost/filesystem.hpp> | ||
#include <boost/filesystem/fstream.hpp> | ||
|
||
#include "fastq_reader.h" | ||
#include "kmer.h" | ||
|
||
using namespace std; | ||
namespace fs = boost::filesystem; | ||
|
||
FastQReader::FastQReader(vector<string>& fnames, k_t k) | ||
: k(k), paths(), curr_path(), curr_file(), so_far(), max_byte(~1), read_col(0) | ||
{ | ||
for (vector<string>::iterator it = fnames.begin(); it < fnames.end(); it++) { | ||
paths.push_back(fs::path(*it)); | ||
} | ||
} | ||
|
||
uintmax_t FastQReader::total_bytes(void) | ||
{ | ||
uintmax_t total = 0; | ||
|
||
for (vector<fs::path>::iterator it = paths.begin(); it < paths.end(); it++) { | ||
total += fs::file_size(*it); | ||
} | ||
|
||
return total; | ||
} | ||
|
||
void FastQReader::set_max_byte(uintmax_t max_byte) | ||
{ | ||
this->max_byte = max_byte; | ||
} | ||
|
||
void FastQReader::seek(uintmax_t offset) | ||
{ | ||
for (curr_path = paths.begin(); curr_path < paths.end(); curr_path++) { | ||
if ((so_far + fs::file_size(*curr_path)) >= offset) { | ||
curr_file.close(); | ||
curr_file.open(*curr_path); | ||
break; | ||
} | ||
so_far += fs::file_size(*curr_path); | ||
} | ||
|
||
/** | ||
* FIXME - We assume '@' will only appear at the beginning of a read. Check | ||
* to see that quality scores don't clash. | ||
*/ | ||
curr_file.seekg(offset - so_far, ios::beg); | ||
seek_to_next_read(); | ||
} | ||
|
||
void FastQReader::seek_to_next_read() | ||
{ | ||
curr_file.ignore(MAX_LINE_LEN, '@'); | ||
if (curr_file.eof()) { | ||
next_file(); | ||
} else { | ||
curr_file.putback('@'); | ||
} | ||
} | ||
|
||
uintmax_t FastQReader::tell(void) | ||
{ | ||
return so_far + curr_file.tellg(); | ||
} | ||
|
||
bool FastQReader::next_file() | ||
{ | ||
curr_file.seekg(0, ios::end); | ||
so_far += curr_file.tellg(); | ||
curr_path++; | ||
curr_file.close(); | ||
if (curr_path == paths.end()) | ||
{ | ||
return false; | ||
} | ||
curr_file.open(*curr_path); | ||
return true; | ||
} | ||
|
||
bool FastQReader::read_next(qkmer_t* qkmer) | ||
{ | ||
if (curr_path == paths.end()) { | ||
return false; | ||
} else if (!read_col && tell() >= max_byte) { | ||
return false; | ||
} | ||
|
||
if (read_col == 0) { | ||
char read_buf[MAX_LINE_LEN]; | ||
char quals_buf[MAX_LINE_LEN]; | ||
|
||
/* Ignore the header line. */ | ||
curr_file.ignore(MAX_LINE_LEN, '\n'); | ||
/* Get the actual read. */ | ||
curr_file.getline(read_buf, MAX_LINE_LEN); | ||
/* Ignore +. */ | ||
curr_file.ignore(MAX_LINE_LEN, '\n'); | ||
/* Getthe quality scores. */ | ||
curr_file.getline(quals_buf, MAX_LINE_LEN); | ||
read_len = strlen(read_buf); | ||
|
||
str2kmer(read, read_buf, read_len); | ||
for (size_t i = 0; i < read_len; i++) { | ||
quals[i] = quals_buf[i] - ILLUMINA_QUAL_OFFSET; | ||
} | ||
} | ||
|
||
// WARNING: This memset may be needed. | ||
//memset(qkmer, 0, qkmer_size(k + 2)); | ||
base b; | ||
if (read_col == 0) { | ||
for_base_in_kmer_from(b, read, k + 1, read_col) { | ||
set_base(qkmer->kmer, 1 + b_i_, b); | ||
} end_for; | ||
set_base(qkmer->kmer, 0, BASE::N); | ||
qkmer->lqual = 0; | ||
qkmer->rqual = quals[read_col + k]; | ||
} else if (read_col <= read_len - (k + 1)) { | ||
for_base_in_kmer_from(b, read, k + 2, read_col - 1) { | ||
set_base(qkmer->kmer, b_i_, b); | ||
} end_for; | ||
qkmer->lqual = quals[read_col - 1]; | ||
qkmer->rqual = quals[read_col + k]; | ||
} else { /* read_col == read_len - k */ | ||
for_base_in_kmer_from(b, read, k + 1, read_col - 1) { | ||
set_base(qkmer->kmer, b_i_, b); | ||
} end_for; | ||
set_base(qkmer->kmer, k + 1, BASE::N); | ||
qkmer->lqual = quals[read_col - 1]; | ||
qkmer->rqual = 0; | ||
} | ||
|
||
read_col++; | ||
|
||
if (read_col > read_len - k) { | ||
seek_to_next_read(); | ||
read_col = 0; | ||
} | ||
|
||
return true; | ||
} |
Oops, something went wrong.