# quartznet_speech_recognition.yaml
# Forked from NVIDIA/NeMo.
# Model identity and input sample rate.
model: "QuartzNet"
sample_rate: 16000

# Shared values. Each one carries an anchor that is aliased further down in
# this file (*n_mels, *rep, *drop, *se, *kfactor), so changing a value here
# changes every place that aliases it.
n_mels: &n_mels 64
rep: &rep 1
dropout: &drop 0.0
se: &se true
kernel_size_factor: &kfactor 2.0
# Data-layer settings, split into a training and an evaluation variant.
AudioToTextDataLayer:
  train:
    shuffle: true
  eval:
    shuffle: false
    # NOTE(review): placed under `eval` because it directly follows the eval
    # keys; confirm it is not meant to apply to the whole data layer.
    max_duration: null
# Settings for the audio-to-mel-spectrogram preprocessing stage.
AudioToMelSpectrogramPreprocessor:
  normalize: "per_feature"
  window_size: 0.02
  window_stride: 0.01
  window: "hann"
  # Aliases the shared n_mels anchor (64) instead of repeating the literal, so
  # this value cannot drift from JasperEncoder.feat_in, which uses the same
  # alias.
  features: *n_mels
  n_fft: 512
  frame_splicing: 1
  dither: 0.00001
  stft_conv: true
# Spectrogram augmentation: count of rectangular masks and their extent along
# the time and frequency axes.
SpectrogramAugmentation:
  rect_masks: 5
  rect_time: 120
  rect_freq: 50
# Encoder definition: scalar settings first, then the ordered list of blocks
# under `jasper`.
JasperEncoder:
  # Input feature count, taken from the shared n_mels anchor.
  feat_in: *n_mels
  activation: "relu"
  conv_mask: true

  # Six blocks, in order. Blocks 1, 5 and 6 pin `repeat: 1` and disable the
  # residual; blocks 2-4 take `repeat` from the shared `rep` anchor and enable
  # the residual. Every block uses 32 filters and the shared dropout value.
  jasper:
    # Block 1: kernel 11, single repeat, no residual.
    - filters: 32
      repeat: 1
      kernel: [11]
      stride: [1]
      dilation: [1]
      dropout: *drop
      residual: false
      separable: true
      se: *se
      kernel_size_factor: *kfactor

    # Block 2: kernel 11, shared repeat count, residual enabled.
    - filters: 32
      repeat: *rep
      kernel: [11]
      stride: [1]
      dilation: [1]
      dropout: *drop
      residual: true
      separable: true
      se: *se
      kernel_size_factor: *kfactor

    # Block 3: kernel 13, shared repeat count, residual enabled.
    - filters: 32
      repeat: *rep
      kernel: [13]
      stride: [1]
      dilation: [1]
      dropout: *drop
      residual: true
      separable: true
      se: *se
      kernel_size_factor: *kfactor

    # Block 4: kernel 17, shared repeat count, residual enabled.
    - filters: 32
      repeat: *rep
      kernel: [17]
      stride: [1]
      dilation: [1]
      dropout: *drop
      residual: true
      separable: true
      se: *se
      kernel_size_factor: *kfactor

    # Block 5: kernel 29 with dilation 2, single repeat, no residual.
    - filters: 32
      repeat: 1
      kernel: [29]
      stride: [1]
      dilation: [2]
      dropout: *drop
      residual: false
      separable: true
      se: *se
      kernel_size_factor: *kfactor

    # Block 6: kernel 1, single repeat, no residual.
    # NOTE(review): unlike the five blocks above, this one sets no
    # `separable`, `se` or `kernel_size_factor`, so whatever defaults the
    # consumer applies are used here; confirm that is intended.
    - filters: 32
      repeat: 1
      kernel: [1]
      stride: [1]
      dilation: [1]
      dropout: *drop
      residual: false
# The two labels this config declares.
labels:
  - "dog"
  - "cat"