/
wav.go
273 lines (235 loc) · 8.65 KB
/
wav.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
package audio
import (
"encoding/binary"
"io"
"io/ioutil"
"math"
"github.com/auroraapi/aurora-go/errors"
)
// WAV-related constants.
const (
	// DefaultNumChannels is 1 (mono audio)
	DefaultNumChannels uint16 = 1
	// DefaultSampleRate is 16KHz
	DefaultSampleRate uint32 = 16000
	// DefaultAudioFormat is 1 (raw, uncompressed PCM waveforms)
	DefaultAudioFormat uint16 = 1
	// DefaultBitsPerSample is 16 (2 bytes per sample).
	DefaultBitsPerSample uint16 = 16
)
// WAV represents a PCM audio file in the WAV container format. It keeps
// a high-level description of the parameters of the file, along with the
// raw audio bytes, until it needs to be written to a file, stream, or array.
// It is based on the WAV formatting as specified:
// http://soundfile.sapp.org/doc/WaveFormat/
type WAV struct {
	// NumChannels is the number of channels the WAV file has. 1 = mono,
	// 2 = stereo, etc. This affects the block align and also number of
	// bytes per sample: (BitsPerSample / 8) * NumChannels.
	NumChannels uint16
	// SampleRate is the number of samples taken per second.
	SampleRate uint32
	// AudioFormat is the type of Audio that is encoded in the WAV file.
	// In most scenarios, this will be 1 (1 = raw, uncompressed PCM audio),
	// since WAV doesn't support compression.
	AudioFormat uint16
	// BitsPerSample is the width of each sample. 16 bits means each sample
	// is two bytes.
	BitsPerSample uint16
	// audioData is the raw audio data stored in the WAV file (the "data"
	// subchunk payload, without the 44-byte header).
	audioData []byte
}
// WAVParams are a set of parameters used to create a WAV file. Its fields
// correspond directly to the WAV file.
type WAVParams struct {
	// NumChannels is the channel count (1 = mono, 2 = stereo, ...).
	NumChannels uint16
	// SampleRate is the number of samples taken per second.
	SampleRate uint32
	// BitsPerSample is the width of each sample, in bits.
	BitsPerSample uint16
	// AudioData is the raw PCM audio payload (no WAV header).
	AudioData []byte
}
// NewWAV returns a new, empty WAV file using the default parameters:
// mono, 16KHz, raw PCM, 16 bits per sample, and no audio data.
func NewWAV() *WAV {
	w := &WAV{
		NumChannels:   DefaultNumChannels,
		SampleRate:    DefaultSampleRate,
		AudioFormat:   DefaultAudioFormat,
		BitsPerSample: DefaultBitsPerSample,
	}
	// Start with a non-nil, zero-length buffer so audio can be appended
	// immediately.
	w.audioData = []byte{}
	return w
}
// NewWAVFromParams returns a new WAV file from the passed-in parameters.
// If any of the parameters are 0 (or the audio data is nil), it is given
// the default value. A nil params is equivalent to calling NewWAV.
//
// Unlike the previous implementation, the caller's params struct is never
// modified: defaults are applied to local copies, so passing a shared
// WAVParams to several calls has no surprising side effects.
func NewWAVFromParams(params *WAVParams) *WAV {
	if params == nil {
		return NewWAV()
	}
	numChannels := params.NumChannels
	if numChannels == 0 {
		numChannels = DefaultNumChannels
	}
	sampleRate := params.SampleRate
	if sampleRate == 0 {
		sampleRate = DefaultSampleRate
	}
	bitsPerSample := params.BitsPerSample
	if bitsPerSample == 0 {
		bitsPerSample = DefaultBitsPerSample
	}
	audioData := params.AudioData
	if audioData == nil {
		// Normalize nil to an empty slice so later appends and writes
		// always operate on a non-nil buffer.
		audioData = make([]byte, 0)
	}
	return &WAV{
		NumChannels:   numChannels,
		SampleRate:    sampleRate,
		AudioFormat:   DefaultAudioFormat,
		BitsPerSample: bitsPerSample,
		audioData:     audioData,
	}
}
// NewWAVFromData creates a WAV format struct from the given data buffer.
// It scans for the "RIFF" chunk ID (which marks the start of the WAV
// header), validates the fixed header fields, and splits the buffer into
// its parameters and raw audio payload.
//
// Returns a WAVCorruptFile error if no RIFF header is found, if the
// buffer is too short to hold a 44-byte header, or if any of the
// required header markers ("WAVE", "fmt ", "data") are missing.
func NewWAVFromData(data []byte) (*WAV, error) {
	// Find the "RIFF" chunk ID. The previous implementation's loop
	// condition `i < len(data) && a || b || c` parsed as
	// `(i < len(data) && a) || b || c`, so the length guard did not
	// protect the later index expressions and the scan panicked on any
	// buffer that did not contain "RIFF". This scan is fully bounded.
	hOff := -1
	for i := 0; i+4 <= len(data); i++ {
		if data[i] == 'R' && data[i+1] == 'I' && data[i+2] == 'F' && data[i+3] == 'F' {
			hOff = i
			break
		}
	}
	if hOff < 0 {
		return nil, errors.NewFromErrorCodeInfo(errors.WAVCorruptFile, "The letters `RIFF` should exist from bytes 0 to 3 in big endian form from the start of the header to indicate that it is a RIFF header.")
	}
	// A full WAV header is 44 bytes from the start of "RIFF"; anything
	// shorter cannot hold the fields we read below.
	if len(data)-hOff < 44 {
		return nil, errors.NewFromErrorCode(errors.WAVCorruptFile)
	}
	// Verifies that "WAVE" letters exist in big endian form
	if data[hOff+8] != 'W' || data[hOff+9] != 'A' || data[hOff+10] != 'V' || data[hOff+11] != 'E' {
		return nil, errors.NewFromErrorCodeInfo(errors.WAVCorruptFile, "The letters `WAVE` should exist from bytes 8 to 11 in big endian form from the start of the header to indicate that it is a WAVE format file.")
	}
	// Verifies that "fmt " letters exist in big endian form
	if data[hOff+12] != 'f' || data[hOff+13] != 'm' || data[hOff+14] != 't' || data[hOff+15] != ' ' {
		return nil, errors.NewFromErrorCodeInfo(errors.WAVCorruptFile, "The letters `fmt ` should exist from bytes 12 to 15 in big endian form from the start of the header to indicate the subchunk 1 ID")
	}
	// Verifies that the "data" letters exist in big endian form
	if data[hOff+36] != 'd' || data[hOff+37] != 'a' || data[hOff+38] != 't' || data[hOff+39] != 'a' {
		return nil, errors.NewFromErrorCodeInfo(errors.WAVCorruptFile, "The letters `data` should exist from bytes 36 to 39 in big endian form from the start of the header to indicate the subchunk 2 ID.")
	}
	// The numeric header fields are little-endian.
	numChannels := binary.LittleEndian.Uint16(data[hOff+22 : hOff+24])
	sampleRate := binary.LittleEndian.Uint32(data[hOff+24 : hOff+28])
	bitsPerSample := binary.LittleEndian.Uint16(data[hOff+34 : hOff+36])
	// The actual sound data begins 44 bytes from the start of the header.
	audioData := data[hOff+44:]
	return &WAV{
		NumChannels:   numChannels,
		SampleRate:    sampleRate,
		AudioFormat:   DefaultAudioFormat,
		BitsPerSample: bitsPerSample,
		audioData:     audioData,
	}, nil
}
// NewWAVFromReader consumes the reader to EOF and parses the bytes read
// as a WAV file. Any read error is returned unchanged; parse errors come
// from NewWAVFromData.
func NewWAVFromReader(reader io.Reader) (*WAV, error) {
	contents, err := ioutil.ReadAll(reader)
	if err != nil {
		return nil, err
	}
	return NewWAVFromData(contents)
}
// TrimSilent is called on a WAV struct to trim the silent portions from
// the ends of the file while leaving a certain amount of padding. The
// padding input is specified in seconds. The threshold input is a decimal
// (between 0 and 1) and is relative to the maximum amplitude of the
// waveform.
//
// This version fixes several out-of-range panics in the original: the
// head/tail RMS windows are clamped to the buffer, the two scans can no
// longer cross each other (fully-silent audio trims to empty instead of
// panicking), and the final padded slice is clamped to valid bounds.
func (w *WAV) TrimSilent(threshold float64, padding float64) {
	// Size of one sample frame (all channels) in bytes.
	sampleSize := int(w.NumChannels) * int(w.BitsPerSample) / 8
	if sampleSize == 0 || len(w.audioData) == 0 {
		return
	}
	// Number of sample frames examined in each step.
	const step = 1024
	window := sampleSize * step
	// Maximum possible amplitude for signed samples of this bit width.
	maxPossibleAmp := math.Exp2(float64(w.BitsPerSample)) / 2.0
	// silenceThresh is a percentage of the maximum wave height.
	silenceThresh := threshold * maxPossibleAmp

	// Trim the beginning: advance N1 past each silent window. The last
	// window is clamped so we never slice past the end of the buffer.
	N1 := 0
	for N1 < len(w.audioData) {
		end := N1 + window
		if end > len(w.audioData) {
			end = len(w.audioData)
		}
		if rms(sampleSize, w.audioData[N1:end]) > silenceThresh {
			break
		}
		N1 = end
	}
	// Trim the end: retreat N2 past each silent window, stopping at N1 so
	// the two cursors cannot cross.
	N2 := len(w.audioData)
	for N2 > N1 {
		start := N2 - window
		if start < N1 {
			start = N1
		}
		if rms(sampleSize, w.audioData[start:N2]) > silenceThresh {
			break
		}
		N2 = start
	}
	// Convert the padding duration to bytes, rounded down to a whole
	// sample frame so channel alignment is preserved.
	paddingBytes := int(padding * float64(w.SampleRate) * float64(sampleSize))
	paddingBytes -= paddingBytes % sampleSize
	lo := N1 - paddingBytes
	if lo < 0 {
		lo = 0
	}
	hi := N2 + paddingBytes
	if hi > len(w.audioData) {
		hi = len(w.audioData)
	}
	w.audioData = w.audioData[lo:hi]
}
// AddAudioData appends the passed-in raw audio bytes to the WAV struct's
// existing audio data. A nil or empty slice is a no-op.
func (w *WAV) AddAudioData(d []byte) {
	if len(d) == 0 {
		return
	}
	w.audioData = append(w.audioData, d...)
}
// AudioData returns the raw audio data stored in the WAV struct (the
// payload only — no WAV header bytes).
func (w *WAV) AudioData() []byte {
	return w.audioData
}
// Data serializes the WAV struct into a complete WAV file: the canonical
// 44-byte header followed by the raw audio bytes. The header layout
// follows http://soundfile.sapp.org/doc/WaveFormat/ — ASCII chunk IDs are
// stored big-endian (as literal bytes) and numeric fields little-endian.
func (w *WAV) Data() []byte {
	const headerLen = 44
	dataLen := len(w.audioData)
	// Pre-size the capacity so appending the audio payload does not
	// reallocate.
	out := make([]byte, headerLen, headerLen+dataLen)

	// RIFF chunk descriptor.
	copy(out[0:4], "RIFF")
	// ChunkSize counts everything after this field: 4 ("WAVE") +
	// (8 + 16) (fmt subchunk) + (8 + dataLen) (data subchunk).
	binary.LittleEndian.PutUint32(out[4:8], uint32(dataLen+headerLen-8))
	copy(out[8:12], "WAVE")

	// "fmt " subchunk: audio format metadata.
	copy(out[12:16], "fmt ")
	binary.LittleEndian.PutUint32(out[16:20], 16) // subchunk 1 size (PCM)
	binary.LittleEndian.PutUint16(out[20:22], w.AudioFormat)
	binary.LittleEndian.PutUint16(out[22:24], w.NumChannels)
	binary.LittleEndian.PutUint32(out[24:28], w.SampleRate)
	// Byte rate = SampleRate * NumChannels * BitsPerSample/8.
	byteRate := w.SampleRate * uint32(w.NumChannels) * uint32(w.BitsPerSample) / 8
	binary.LittleEndian.PutUint32(out[28:32], byteRate)
	// Block align = NumChannels * BitsPerSample/8.
	binary.LittleEndian.PutUint16(out[32:34], w.NumChannels*w.BitsPerSample/8)
	binary.LittleEndian.PutUint16(out[34:36], w.BitsPerSample)

	// "data" subchunk: length-prefixed raw audio payload.
	copy(out[36:40], "data")
	binary.LittleEndian.PutUint32(out[40:44], uint32(dataLen))
	return append(out, w.audioData...)
}