Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Copied cpp code from other repo
  • Loading branch information
Albert Kim committed Apr 5, 2012
0 parents commit f65202b
Show file tree
Hide file tree
Showing 26 changed files with 2,640 additions and 0 deletions.
31 changes: 31 additions & 0 deletions Makefile
@@ -0,0 +1,31 @@
PROG_NAME = mermaid

CXX = mpic++
CXXFLAGS = -c -g -Wall -O3 -fshort-enums
LDFLAGS =
ifeq ($(shell uname), Darwin)
LIBS = -lboost_mpi -lboost_serialization-mt -lboost_filesystem-mt -lboost_system-mt
else
LIBS = -lboost_mpi -lboost_serialization -lboost_filesystem -lboost_system
endif
DEPFLAGS = -MM

SRCS := $(wildcard *.cpp)
OBJS := $(SRCS:.cpp=.o)
DEPS := $(OBJS:.o=.d)

all : $(PROG_NAME)

$(PROG_NAME) : $(OBJS)
$(CXX) $(LDFLAGS) -o $@ $^ $(LIBS)

%.d : %.cpp
@$(CXX) $(DEPFLAGS) -o $@ $^
# @echo -e '\t$$(CXX) $$(CXXFLAGS) -o $$@ $$<' >> $@

-include $(DEPS)

.PHONY : clean

clean :
rm -rf *.o *.d $(PROG_NAME)
6 changes: 6 additions & 0 deletions bam_reader.cpp
@@ -0,0 +1,6 @@
#include "bam_reader.h"

BAMReader::BAMReader(char* fname)
: Reader(fname)
{
}
11 changes: 11 additions & 0 deletions bam_reader.h
@@ -0,0 +1,11 @@
#ifndef _BAM_READER_H_
#define _BAM_READER_H_

#include "reader.h"

class BAMReader : public Reader {
public:
BAMReader(char* fname);
};

#endif /* _BAM_READER_H_ */
66 changes: 66 additions & 0 deletions base.cpp
@@ -0,0 +1,66 @@
#include "base.h"
#include "utils.h"

namespace BASE {
base char2base(char c)
{
switch (c) {
case 'A': return A;
case 'C': return C;
case 'G': return G;
case 'T': return T;
case 'N': return N;
default: panic("Invalid %s: %u\n", __func__, c);
}
}

char base2char(base b)
{
switch(b) {
case A: return 'A';
case C: return 'C';
case G: return 'G';
case T: return 'T';
case N: return 'N';
default: panic("Invalid %s: %u\n", __func__, b);
}
}

base inv_base(base b)
{
switch (b) {
case A: return T;
case C: return G;
case G: return C;
case T: return A;
case N: return N;
default: panic("Invalid %s: %u\n", __func__, b);
}
}

char inv_base(char c)
{
switch (c) {
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
case 'T': return 'A';
case 'N': return 'N';
default: panic("Invalid %s: %u\n", __func__, c);
}
}

bool validate_base(base b)
{
switch (b) {
case A:
case C:
case G:
case T:
case N:
return true;
default:
return false;
}
}
}
19 changes: 19 additions & 0 deletions base.h
@@ -0,0 +1,19 @@
#ifndef _BASE_H_
#define _BASE_H_

#include <boost/cstdint.hpp>

#include "exceptions.h"

namespace BASE {
enum base { A = 0, C, G, T, N };
const uint8_t NUM_BASES = 4;

base char2base(char c);
char base2char(base b);
base inv_base(base b);
char inv_base(char c);
bool validate_base(base b);
}

#endif /* _BASE_H_ */
41 changes: 41 additions & 0 deletions bloom_filter.cpp
@@ -0,0 +1,41 @@
#include <cstdlib>
#include <climits>
#include <cmath>

#include <boost/cstdint.hpp>

#include "bloom_filter.h"

#define set_bit(a, i) ((a)[(i)/CHAR_BIT] |= (1 << ((i) % CHAR_BIT)))
#define get_bit(a, i) ((a)[(i)/CHAR_BIT] & (1 << ((i) % CHAR_BIT)))

BloomFilter::BloomFilter(size_t capacity, float error_rate, size_t seed, bloom_filter_hash_func_t hash)
: capacity(capacity), count(0), seed(seed), hash(hash)
{
nfuncs = (size_t) ceil(log2(1 / error_rate));
slice_size = (size_t) ceil(capacity * log(2));
/* FIXME - Using CHAR_BIT may tie us to using only gcc. */
array = (uint8_t*) calloc((nfuncs * slice_size + CHAR_BIT - 1) / CHAR_BIT, sizeof(uint8_t));
}

BloomFilter::~BloomFilter()
{
free(array);
}

void BloomFilter::add(void* key)
{
for (size_t i = 0; i < nfuncs; i++) {
set_bit(array, hash(i + seed, key) % slice_size + i * slice_size);
}
count++;
}

bool BloomFilter::check(void* key)
{
for (size_t i = 0; i < nfuncs; i++) {
if (!get_bit(array, hash(i + seed, key) % slice_size + i * slice_size))
return false;
}
return true;
}
31 changes: 31 additions & 0 deletions bloom_filter.h
@@ -0,0 +1,31 @@
#ifndef _BLOOM_FILTER_H_
#define _BLOOM_FILTER_H_

#include <boost/cstdint.hpp>

typedef size_t (*bloom_filter_hash_func_t)(size_t seed, void* key);

class BloomFilter {
public:
BloomFilter(size_t capacity, float error_rate, size_t seed, bloom_filter_hash_func_t hash);
~BloomFilter();

/* Adds the key to the filter. */
void add(void* key);
/* Returns true if key was found in the filter; false otherwise. */
bool check(void* key);

bool full(void) { return count >= capacity; }

protected:
uint8_t* array;
size_t capacity;
size_t nfuncs;
size_t slice_size; /* Size of each partition. */
size_t count;
size_t seed;
bloom_filter_hash_func_t hash;
};


#endif /* _BLOOM_FILTER_H_ */
13 changes: 13 additions & 0 deletions config.h
@@ -0,0 +1,13 @@
#ifndef _CONFIG_H_
#define _CONFIG_H_

typedef uint32_t k_t;

const uint8_t Q_MIN = 19; /* Minimum quality threshold. */
const uint8_t D_MIN = 10; /* Minimum number of high-quality extensions needed. */

const k_t K = 41;

const unsigned int PREFIX_ALLOC_LENGTH = 5;

#endif /* _CONFIG_H_ */
9 changes: 9 additions & 0 deletions exceptions.h
@@ -0,0 +1,9 @@
#ifndef _EXCEPTIONS_H_
#define _EXCEPTIONS_H_

enum {
INVALID_BASE,
KMER_OUT_OF_BOUNDS
};

#endif /* _EXCEPTIONS_H_ */
149 changes: 149 additions & 0 deletions fastq_reader.cpp
@@ -0,0 +1,149 @@
#include <cstdlib>
#include <vector>
#include <string>
#include <iostream>
#include <cstring>

#include <boost/filesystem.hpp>
#include <boost/filesystem/fstream.hpp>

#include "fastq_reader.h"
#include "kmer.h"

using namespace std;
namespace fs = boost::filesystem;

FastQReader::FastQReader(vector<string>& fnames, k_t k)
: k(k), paths(), curr_path(), curr_file(), so_far(), max_byte(~1), read_col(0)
{
for (vector<string>::iterator it = fnames.begin(); it < fnames.end(); it++) {
paths.push_back(fs::path(*it));
}
}

uintmax_t FastQReader::total_bytes(void)
{
uintmax_t total = 0;

for (vector<fs::path>::iterator it = paths.begin(); it < paths.end(); it++) {
total += fs::file_size(*it);
}

return total;
}

void FastQReader::set_max_byte(uintmax_t max_byte)
{
this->max_byte = max_byte;
}

void FastQReader::seek(uintmax_t offset)
{
for (curr_path = paths.begin(); curr_path < paths.end(); curr_path++) {
if ((so_far + fs::file_size(*curr_path)) >= offset) {
curr_file.close();
curr_file.open(*curr_path);
break;
}
so_far += fs::file_size(*curr_path);
}

/**
* FIXME - We assume '@' will only appear at the beginning of a read. Check
* to see that quality scores don't clash.
*/
curr_file.seekg(offset - so_far, ios::beg);
seek_to_next_read();
}

void FastQReader::seek_to_next_read()
{
curr_file.ignore(MAX_LINE_LEN, '@');
if (curr_file.eof()) {
next_file();
} else {
curr_file.putback('@');
}
}

uintmax_t FastQReader::tell(void)
{
return so_far + curr_file.tellg();
}

bool FastQReader::next_file()
{
curr_file.seekg(0, ios::end);
so_far += curr_file.tellg();
curr_path++;
curr_file.close();
if (curr_path == paths.end())
{
return false;
}
curr_file.open(*curr_path);
return true;
}

bool FastQReader::read_next(qkmer_t* qkmer)
{
if (curr_path == paths.end()) {
return false;
} else if (!read_col && tell() >= max_byte) {
return false;
}

if (read_col == 0) {
char read_buf[MAX_LINE_LEN];
char quals_buf[MAX_LINE_LEN];

/* Ignore the header line. */
curr_file.ignore(MAX_LINE_LEN, '\n');
/* Get the actual read. */
curr_file.getline(read_buf, MAX_LINE_LEN);
/* Ignore +. */
curr_file.ignore(MAX_LINE_LEN, '\n');
/* Getthe quality scores. */
curr_file.getline(quals_buf, MAX_LINE_LEN);
read_len = strlen(read_buf);

str2kmer(read, read_buf, read_len);
for (size_t i = 0; i < read_len; i++) {
quals[i] = quals_buf[i] - ILLUMINA_QUAL_OFFSET;
}
}

// WARNING: This memset may be needed.
//memset(qkmer, 0, qkmer_size(k + 2));
base b;
if (read_col == 0) {
for_base_in_kmer_from(b, read, k + 1, read_col) {
set_base(qkmer->kmer, 1 + b_i_, b);
} end_for;
set_base(qkmer->kmer, 0, BASE::N);
qkmer->lqual = 0;
qkmer->rqual = quals[read_col + k];
} else if (read_col <= read_len - (k + 1)) {
for_base_in_kmer_from(b, read, k + 2, read_col - 1) {
set_base(qkmer->kmer, b_i_, b);
} end_for;
qkmer->lqual = quals[read_col - 1];
qkmer->rqual = quals[read_col + k];
} else { /* read_col == read_len - k */
for_base_in_kmer_from(b, read, k + 1, read_col - 1) {
set_base(qkmer->kmer, b_i_, b);
} end_for;
set_base(qkmer->kmer, k + 1, BASE::N);
qkmer->lqual = quals[read_col - 1];
qkmer->rqual = 0;
}

read_col++;

if (read_col > read_len - k) {
seek_to_next_read();
read_col = 0;
}

return true;
}

0 comments on commit f65202b

Please sign in to comment.