Skip to content

Commit

Permalink
added bed and fasta parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mhuttner committed Nov 23, 2014
1 parent defceaf commit a23c2fd
Show file tree
Hide file tree
Showing 24 changed files with 396 additions and 252 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -23,6 +23,7 @@ config.log
config.status

miRA
miRAtest
check_cluster
check_parse_sam
src/.deps/
Expand Down
62 changes: 61 additions & 1 deletion LICENCE → LICENSE
@@ -1,4 +1,4 @@
GNU GENERAL PUBLIC LICENSE
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991

Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
Expand Down Expand Up @@ -278,3 +278,63 @@ PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.

END OF TERMS AND CONDITIONS

How to Apply These Terms to Your New Programs

If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

{description}
Copyright (C) {year} {fullname}

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

Also add information on how to contact you by electronic and paper mail.

If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:

Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.

You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:

Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.

{signature of Ty Coon}, 1 April 1989
Ty Coon, President of Vice

This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.

4 changes: 2 additions & 2 deletions Makefile.am
@@ -1,6 +1,6 @@
ACLOCAL_AMFLAGS = -I m4 --install
bin_PROGRAMS = miRA
miRA_SOURCES = src/main.c src/help.c src/cluster.c src/parse_sam.c src/errors.c src/vfold.c src/bed_file_io.c
miRA_SOURCES = src/main.c src/help.c src/cluster.c src/parse_sam.c src/errors.c src/vfold.c src/bed.c src/fasta.c
miRA_CFLAGS = -std=c11 -I src/ViennaRNA-2.1.8/H/
miRA_CPPFLAGS = -DDEBUG
miRA_LDFLAGS = -L src/ViennaRNA-2.1.8/lib/
Expand All @@ -11,7 +11,7 @@ miRA_LDFLAGS = -L src/ViennaRNA-2.1.8/lib/
EXTRA_PROGRAMS = miRAtest


miRAtest_SOURCES = test/main.c test/testerino.c test/test_cluster.c test/test_parse_sam.c src/errors.c src/parse_sam.c src/cluster.c src/vfold.c src/bed_file_io.c
miRAtest_SOURCES = test/main.c test/testerino.c test/test_cluster.c test/test_parse_sam.c test/test_bed_file_io.c src/errors.c src/parse_sam.c src/cluster.c src/vfold.c src/bed.c src/fasta.c test/test_fasta.c
miRAtest_CFLAGS = -std=c11

.PHONY: clean test
Expand Down
Binary file modified miRAtest
Binary file not shown.
64 changes: 39 additions & 25 deletions src/bed_file_io.c → src/bed.c
Expand Up @@ -2,7 +2,7 @@
#include <stdio.h>
#include "errors.h"
#include "cluster.h"
#include "bed_file_io.h"
#include "bed.h"
#include "defs.h"

int write_bed_file(char *filename, struct cluster_list *list) {
Expand Down Expand Up @@ -99,32 +99,46 @@ int parse_bed_line(struct cluster **result, char *line) {
free(tokens);
return E_INVALID_BED_LINE;
}
/* discard unneeded tokens */
free(tokens[4]);
free(tokens[8]);

char *check = NULL;
c->chrom = tokens[0];
c->start = strtol(tokens[1], &check, 10);
if (check == tokens[1] || *check != 0) {
goto line_invalid;
}

return E_SUCCESS;
}
/*
c->end = strtol(tokens[2], &check, 10);
if (check == tokens[2] || *check != 0) {
goto line_invalid;
}
/*skip the name of the cluster and only parse the id */
c->id = strtol(tokens[3] + 8, &check, 10);
if (check == tokens[3] + 8 || *check != 0) {
goto line_invalid;
}

free(tokens[4]);
c->strand = *tokens[5];
c->flank_start = strtol(tokens[6], &check, 10);
if (check == tokens[6] || *check != 0) {
goto line_invalid;
}
c->flank_start = strtol(tokens[7], &check, 10);
if (check == tokens[7] || *check != 0) {
goto line_invalid;
}
free(tokens[8]);
c->readcount = strtol(tokens[9], &check, 10);
if (check == tokens[9] || *check != 0) {
goto line_invalid;
}
*result = c;

*/
return E_SUCCESS;
line_invalid:
for (int i = 0; i < num_entries; i++) {
free(tokens[i]);
}
free(tokens);
return E_INVALID_BED_LINE;
}
2 changes: 2 additions & 0 deletions src/bed_file_io.h → src/bed.h
@@ -1,3 +1,5 @@
#include "cluster.h"

#ifndef BED_FILE_IO_H
#define BED_FILE_IO_H

Expand Down
Binary file removed src/bed_file_io
Binary file not shown.
2 changes: 1 addition & 1 deletion src/cluster.c
Expand Up @@ -4,7 +4,7 @@
#include <getopt.h>
#include "cluster.h"
#include "parse_sam.h"
#include "bed_file_io.h"
#include "bed.h"
#include "errors.h"
#include "string.h"
#include "uthash.h"
Expand Down
2 changes: 2 additions & 0 deletions src/errors.c
Expand Up @@ -12,6 +12,8 @@ struct _errordesc errordesc[] = {
{E_CHROMOSOME_NOT_FOUND, "The Chromosome was not found as a @SQ entry"},
{E_FILE_WRITING_FAILED, "Failed to write the output File"},
{E_INVALID_BED_LINE, "The line of the BED file is invalid"},
{E_UNKNOWN_FILE_IO_ERROR, "An unknown error occurred reading a file"},
{E_INVALID_FASTA_FILE, "Given fasta file is invalid"},
{E_MALLOC_FAIL, "malloc failed, check available memory"},
{E_REALLOC_FAIL, "realloc failed, check available memory"},
{E_UNKNOWN, "An unknown error occured"}};
Expand Down
2 changes: 2 additions & 0 deletions src/errors.h
Expand Up @@ -12,6 +12,8 @@ enum _error {
E_CHROMOSOME_NOT_FOUND = -7,
E_FILE_WRITING_FAILED = -8,
E_INVALID_BED_LINE = -9,
E_UNKNOWN_FILE_IO_ERROR = -10,
E_INVALID_FASTA_FILE = -11,

E_MALLOC_FAIL = -30,
E_REALLOC_FAIL = -31,
Expand Down
123 changes: 123 additions & 0 deletions src/fasta.c
@@ -0,0 +1,123 @@
#include "fasta.h"
#include "errors.h"
#include "fasta.h"
#include <stdlib.h>
#include <stdio.h>

int read_fasta_file(struct genome_sequence **sequence_table, char *filename) {
static const char name_marker = '>';
static const int MAXLINELENGHT = 2048;
struct genome_sequence *current_seq = NULL;
int err, l;
char *sep;
char line[MAXLINELENGHT];
FILE *fp = fopen(filename, "r");
if (fp == NULL) {
return E_FILE_NOT_FOUND;
}
while (fgets(line, sizeof(line), fp) != NULL) {
if (line[0] == name_marker) {
if (current_seq != NULL) {
HASH_ADD_STR(*sequence_table, chrom, current_seq);
}
create_genome_sequence(&current_seq);
sep = strchr(line, ' ');
if (sep == NULL) {
sep = strchr(line, '\n');
}
if (sep == NULL) {
sep = strchr(line, '\0');
}
if (sep == NULL) {
err = E_UNKNOWN_FILE_IO_ERROR;
goto parse_error;
}
l = sep - line - 1;
memcpy(current_seq->chrom, line + 1, l);
current_seq->chrom[l] = 0;
continue;
}
sep = strchr(line, '\n');
if (sep == NULL) {
sep = strchr(line, '\0');
}
if (sep == NULL) {
err = E_UNKNOWN_FILE_IO_ERROR;
goto parse_error;
}
err = append_to_genome_sequence(current_seq, line, sep - line);
if (err) {
goto parse_error;
}
}
HASH_ADD_STR(*sequence_table, chrom, current_seq);
fclose(fp);
return E_SUCCESS;

parse_error:
free_sequence_table(*sequence_table);
fclose(fp);
return err;
}

/*
 * Allocate a new, empty genome sequence record with an initial data buffer.
 *
 * seq: receives the new record on success; it is left untouched on failure.
 *
 * Returns E_SUCCESS, or E_MALLOC_FAIL if either allocation fails. The chrom
 * field and the hash handle of the new record are not initialised here.
 */
int create_genome_sequence(struct genome_sequence **seq) {
  enum { INITIAL_CAPACITY = 4096 };

  struct genome_sequence *fresh = malloc(sizeof *fresh);
  if (fresh == NULL) {
    return E_MALLOC_FAIL;
  }

  char *buffer = malloc(INITIAL_CAPACITY * sizeof *buffer);
  if (buffer == NULL) {
    free(fresh);
    return E_MALLOC_FAIL;
  }

  fresh->data = buffer;
  fresh->n = 0;
  fresh->capacity = INITIAL_CAPACITY;

  *seq = fresh;
  return E_SUCCESS;
}
int strip_newlines(char **dest, size_t *nsize, char *buffer, size_t size) {
char *new_buffer = (char *)malloc(size * sizeof(char));
size_t new_size = 0;
for (size_t i = 0; i < size; i++) {
if (buffer[i] != '\n') {
new_buffer[new_size++] = buffer[i];
}
}
char *tmp = (char *)realloc(new_buffer, new_size * sizeof(char));
if (tmp == NULL) {
free(new_buffer);
return E_REALLOC_FAIL;
}
*dest = tmp;
*nsize = new_size;
return E_SUCCESS;
}

/*
 * Append raw bytes to the data buffer of a genome sequence, growing the
 * buffer geometrically when needed.
 *
 * seq:  record to append to. NULL is accepted and ignored, so that data
 *       which is not attached to any record can be dropped by the caller.
 * data: bytes to append (not required to be NUL-terminated).
 * size: number of bytes to append.
 *
 * Returns E_SUCCESS, or E_REALLOC_FAIL if the buffer cannot be grown. On
 * failure the record is left unchanged (data, n and capacity stay valid).
 */
int append_to_genome_sequence(struct genome_sequence *seq, char *data,
                              size_t size) {
  const size_t max_size = (size_t)-1;

  if (seq == NULL) {
    return E_SUCCESS;
  }
  if (size > max_size - seq->n) {
    /* n + size would overflow size_t. */
    return E_REALLOC_FAIL;
  }

  size_t needed = seq->n + size;
  size_t new_capacity = seq->capacity;
  while (new_capacity < needed) {
    if (new_capacity == 0 || new_capacity > max_size / 2) {
      /* Doubling would not make progress or would overflow. */
      new_capacity = needed;
    } else {
      new_capacity *= 2;
    }
  }

  /* Grow once to the final size and only commit the new capacity after
   * realloc succeeded, so capacity always matches the real allocation. */
  if (new_capacity != seq->capacity) {
    char *tmp = realloc(seq->data, new_capacity);
    if (tmp == NULL) {
      return E_REALLOC_FAIL;
    }
    seq->data = tmp;
    seq->capacity = new_capacity;
  }

  if (size > 0) {
    memcpy(seq->data + seq->n, data, size);
    seq->n += size;
  }
  return E_SUCCESS;
}

/*
 * Remove every record from a sequence hash table and release its memory,
 * including the sequence data buffer owned by each record.
 *
 * table: head of the hash table; NULL (an empty table) is allowed.
 *
 * The head is passed by value, so the caller's own head pointer is NOT
 * updated and must be reset to NULL by the caller after this call.
 *
 * Always returns E_SUCCESS.
 */
int free_sequence_table(struct genome_sequence *table) {
  struct genome_sequence *s = NULL;
  struct genome_sequence *tmp = NULL;
  HASH_ITER(hh, table, s, tmp) {
    HASH_DEL(table, s);
    free(s->data); /* the record owns its data buffer */
    free(s);
  }
  return E_SUCCESS;
}
21 changes: 21 additions & 0 deletions src/fasta.h
@@ -0,0 +1,21 @@
#include <stddef.h>
#include "uthash.h"

#ifndef FASTA_H
#define FASTA_H

/* One named sequence read from a FASTA file, stored as a uthash entry. */
struct genome_sequence {
/* NUL-terminated record name; used as the string key of the hash table. */
char chrom[1024];
/* Heap buffer holding the sequence bytes; it is not NUL-terminated. */
char *data;
/* Number of bytes of data currently in use. */
size_t n;
/* Number of bytes currently allocated for data. */
size_t capacity;
/* uthash bookkeeping handle; makes this struct usable as a hash entry. */
UT_hash_handle hh;
};

/* Parse the FASTA file `filename` and add one entry per '>' record to the
 * hash table *sequence_table, keyed by the record name. Returns E_SUCCESS
 * or a negative error code from errors.h. */
int read_fasta_file(struct genome_sequence **sequence_table, char *filename);
/* Allocate an empty sequence record with an initial data buffer and store
 * it in *seq. Returns E_SUCCESS or E_MALLOC_FAIL. */
int create_genome_sequence(struct genome_sequence **seq);
/* Copy `size` bytes of `buffer` without their '\n' characters into a newly
 * allocated buffer returned in *dest; its length is returned in *nsize. The
 * result is not NUL-terminated and must be freed by the caller. */
int strip_newlines(char **dest, size_t *nsize, char *buffer, size_t size);
/* Append `size` bytes of `data` to the data buffer of `seq`, growing the
 * buffer as needed. A NULL `seq` is ignored and reported as success. */
int append_to_genome_sequence(struct genome_sequence *seq, char *data,
size_t size);
/* Remove all entries from the hash table `table` and free them. The
 * caller's head pointer is not modified. */
int free_sequence_table(struct genome_sequence *table);
#endif

0 comments on commit a23c2fd

Please sign in to comment.