-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastq_file.rb
158 lines (142 loc) · 4.57 KB
/
fastq_file.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Copyright 2014, 2015 Ryan Moore
# Contact: moorer@udel.edu
#
# This file is part of parse_fasta.
#
# parse_fasta is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# parse_fasta is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
require 'zlib'
# Provides simple interface for parsing four-line-per-record fastq
# format files. Gzipped files are no problem.
class FastqFile < File
# Returns the records in the fastq file as a hash map with the
# headers as keys pointing to a hash map like so
# { "seq1" => { head: "seq1", seq: "ACTG", desc: "", qual: "II3*"} }
#
# @example Read a fastQ into a hash table.
# seqs = FastqFile.open('reads.fq.gz').to_hash
#
# @return [Hash] A hash with headers as keys, and a hash map as the
# value with keys :head, :seq, :desc, :qual, for header, sequence,
# description, and quality.
def to_hash
hash = {}
self.each_record do |head, seq, desc, qual|
hash[head] = { head: head, seq: seq, desc: desc, qual: qual }
end
hash
end
# Analagous to IO#each_line, #each_record is used to go through a
# fastq file record by record. It will accept gzipped files as well.
#
# @example Parsing a fastq file
# FastqFile.open('reads.fq').each_record do |head, seq, desc, qual|
# # do some fun stuff here!
# end
# @example Use the same syntax for gzipped files!
# FastqFile.open('reads.fq.gz').each_record do |head, seq, desc, qual|
# # do some fun stuff here!
# end
#
# @yield The header, sequence, description and quality string for
# each record in the fastq file to the block
# @yieldparam header [String] The header of the fastq record without
# the leading '@'
# @yieldparam sequence [Sequence] The sequence of the fastq record
# @yieldparam description [String] The description line of the fastq
# record without the leading '+'
# @yieldparam quality_string [Quality] The quality string of the
# fastq record
def each_record
count = 0
header = ''
sequence = ''
description = ''
quality = ''
begin
f = Zlib::GzipReader.open(self)
rescue Zlib::GzipFile::Error => e
f = self
end
f.each_line do |line|
line.chomp!
case count % 4
when 0
header = line[1..-1]
when 1
sequence = Sequence.new(line)
when 2
description = line[1..-1]
when 3
quality = Quality.new(line)
yield(header, sequence, description, quality)
end
count += 1
end
f.close if f.instance_of?(Zlib::GzipReader)
return f
end
# Fast version of #each_record
#
# @note If the fastQ file has spaces in the sequence, they will be
# retained. If this is a problem, use #each_record instead.
#
# @example Parsing a fastq file
# FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
# # do some fun stuff here!
# end
# @example Use the same syntax for gzipped files!
# FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
# # do some fun stuff here!
# end
#
# @yield The header, sequence, description and quality string for
# each record in the fastq file to the block
#
# @yieldparam header [String] The header of the fastq record without
# the leading '@'
# @yieldparam sequence [String] The sequence of the fastq record
# @yieldparam description [String] The description line of the fastq
# record without the leading '+'
# @yieldparam quality_string [String] The quality string of the
# fastq record
def each_record_fast
count = 0
header = ''
sequence = ''
description = ''
quality = ''
begin
f = Zlib::GzipReader.open(self)
rescue Zlib::GzipFile::Error => e
f = self
end
f.each_line do |line|
line.chomp!
case count % 4
when 0
header = line[1..-1]
when 1
sequence = line
when 2
description = line[1..-1]
when 3
quality = line
yield(header, sequence, description, quality)
end
count += 1
end
f.close if f.instance_of?(Zlib::GzipReader)
return f
end
end