-
Notifications
You must be signed in to change notification settings - Fork 0
/
seq_file.rb
147 lines (137 loc) · 4.67 KB
/
seq_file.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Copyright 2014, 2015 Ryan Moore
# Contact: moorer@udel.edu
#
# This file is part of parse_fasta.
#
# parse_fasta is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# parse_fasta is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
# Provides a class that will parse either fastA or fastQ files,
# depending on what the user provides. Handles, gzipped files.
class SeqFile < File
# Returns the records in the sequence file as a hash map with the
# headers as keys and the Sequences as values. For a fastq file,
# acts the same as `FastaFile#to_hash`
#
# @example Read a fastA into a hash table.
# seqs = SeqFile.open('reads.fa').to_hash
#
# @return [Hash] A hash with headers as keys, sequences as the
# values (Sequence objects)
#
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
# and file is a fastA file
def to_hash
first_char = get_first_char(self)
if first_char == '>'
FastaFile.open(self).to_hash
elsif first_char == '@'
FastqFile.open(self).to_hash
else
raise ArgumentError, "Input does not look like FASTA or FASTQ"
end
end
# Analagous to IO#each_line, #each_record will go through a fastA or
# fastQ file record by record.
#
# This #each_record is used in a similar fashion as
# FastaFile#each_record except that it yields the header and the
# sequence regardless of whether the input is a fastA file or a
# fastQ file.
#
# If the input is a fastQ file, this method will yield the header
# and the sequence and ignore the description and the quality
# string. This SeqFile class should only be used if your program
# needs to work on either fastA or fastQ files, thus it ignores the
# quality string and description and treats either file type as if
# it were a fastA file.
#
# If you need the description or quality, you should use
# FastqFile#each_record instead.
#
# @example Parse a gzipped fastA file
# SeqFile.open('reads.fa.gz').each_record do |head, seq|
# puts [head, seq.length].join "\t"
# end
#
# @example Parse an uncompressed fastQ file
# SeqFile.open('reads.fq.gz').each_record do |head, seq|
# puts [head, seq.length].join "\t"
# end
#
# @yieldparam header [String] The header of the record without the
# leading '>' or '@'
#
# @yieldparam sequence [Sequence] The sequence of the record.
#
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
# and file is a fastA file
def each_record
first_char = get_first_char(self)
if first_char == '>'
FastaFile.open(self).each_record do |header, sequence|
yield(header, sequence)
end
elsif first_char == '@'
FastqFile.open(self).each_record do |head, seq, desc, qual|
yield(head, seq)
end
else
raise ArgumentError, "Input does not look like FASTA or FASTQ"
end
end
# Fast version of #each_record
#
# @note If the sequence file has spaces in the sequence, they will
# be retained. If this is a problem, use #each_record instead.
#
# @example Parse a gzipped fastA file
# SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
# puts [head, seq.length].join "\t"
# end
#
# @example Parse an uncompressed fastQ file
# SeqFile.open('reads.fq.gz').each_record_fast do |head, seq|
# puts [head, seq.length].join "\t"
# end
#
# @yieldparam header [String] The header of the record without the
# leading '>' or '@'
#
# @yieldparam sequence [String] The sequence of the record.
#
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
# and file is a fastA file
def each_record_fast
first_char = get_first_char(self)
if first_char == '>'
FastaFile.open(self).each_record_fast do |header, sequence|
yield(header, sequence)
end
elsif first_char == '@'
FastqFile.open(self).each_record_fast do |head, seq, desc, qual|
yield(head, seq)
end
else
raise ArgumentError, "Input does not look like FASTA or FASTQ"
end
end
private
def get_first_char(f)
begin
handle = Zlib::GzipReader.open(f)
rescue Zlib::GzipFile::Error => e
handle = f
end
handle.each_line.peek[0]
end
end