"""
Speech Command Recognition with torchaudio
******************************************
This tutorial will show you how to correctly format an audio dataset and
then train/test an audio classifier network on the dataset.
Colab has a GPU option available. In the menu tabs, select “Runtime” then
“Change runtime type”. In the pop-up that follows, you can choose GPU.
After the change, your runtime should automatically restart (which means
information from executed cells disappears).
First, let’s import the common torch packages, such as
`torchaudio <https://github.com/pytorch/audio>`__, which can be installed
by following the instructions on the website.
"""
# Uncomment the following line to run in Google Colab
# CPU:
# !pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
# GPU:
# !pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
# For interactive demo at the end:
# !pip install pydub
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torchaudio
import matplotlib.pyplot as plt
import os
import time
from tqdm import tqdm
import data
import model
if __name__ == "__main__":
save_path = "runs/test_{}".format(int(time.time()))
######################################################################
# Let’s check if a CUDA GPU is available and select our device. Running
# the network on a GPU will greatly decrease the training/testing runtime.
#
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(device)
    ######################################################################
    # Importing the Dataset
    # ---------------------
    #
    # We use torchaudio to download and represent the dataset. Here we use
    # `SpeechCommands <https://arxiv.org/abs/1804.03209>`__, which is a
    # dataset of 35 commands spoken by different people. The dataset
    # ``SPEECHCOMMANDS`` is a ``torch.utils.data.Dataset`` version of the
    # dataset. In this dataset, all audio files are about 1 second long (and
    # so about 16000 time frames long).
    #
    # The actual loading and formatting steps happen when a data point is
    # being accessed, and torchaudio takes care of converting the audio files
    # to tensors. If one wants to load an audio file directly instead,
    # ``torchaudio.load()`` can be used. It returns a tuple containing the
    # newly created tensor along with the sampling frequency of the audio file
    # (16kHz for SpeechCommands).
    #
    # Going back to the dataset, here we create a subclass that splits it into
    # standard training, validation, and testing subsets.
    #
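    # As mentioned above, a single file can also be loaded directly with
    # ``torchaudio.load()``. A minimal sketch (the path is a placeholder; use
    # any .wav file from the downloaded dataset):
    #
    # waveform_direct, sr = torchaudio.load("path/to/some_speech_command.wav")
    # print(waveform_direct.shape, sr)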
    # Create training and testing split of the data. We do not use validation in this tutorial.
    train_set = data.SubsetSC("training")
    test_set = data.SubsetSC("testing")

    waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
    ######################################################################
    # A data point in the SPEECHCOMMANDS dataset is a tuple made of a waveform
    # (the audio signal), the sample rate, the utterance (the label), the
    # speaker ID, and the utterance number.
    #

    print("Shape of waveform: {}".format(waveform.size()))
    print("Sample rate of waveform: {}".format(sample_rate))
    # plt.plot(waveform.t().numpy());

    ######################################################################
    # Let’s find the list of labels available in the dataset.
    #

    labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
    print(labels)
    ######################################################################
    # The 35 audio labels are commands that are said by users. The first few
    # files are people saying “marvin”.
    #

    waveform_first, *_ = train_set[0]
    # ipd.Audio(waveform_first.numpy(), rate=sample_rate)

    waveform_second, *_ = train_set[1]
    # ipd.Audio(waveform_second.numpy(), rate=sample_rate)

    ######################################################################
    # The last file is someone saying “visual”.
    #

    waveform_last, *_ = train_set[-1]
    # ipd.Audio(waveform_last.numpy(), rate=sample_rate)
    ######################################################################
    # Formatting the Data
    # -------------------
    #
    # This is a good place to apply transformations to the data. For the
    # waveform, we downsample the audio for faster processing without losing
    # too much of the classification power.
    #
    # We don’t need to apply other transformations here. Some datasets,
    # though, require reducing the number of channels (say, from stereo to
    # mono), either by taking the mean along the channel dimension or by
    # simply keeping only one of the channels. Since SpeechCommands uses a
    # single channel for audio, this is not needed here.
    #
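    # For a multi-channel recording, a mono mixdown by averaging the channels
    # would look like the sketch below (not applied here, since SpeechCommands
    # is already mono and ``stereo_waveform`` is just a placeholder name):
    #
    # mono_waveform = stereo_waveform.mean(dim=0, keepdim=True)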
    transform = data.get_transform(sample_rate)
    transformed = transform(waveform)
    # ipd.Audio(transformed.numpy(), rate=new_sample_rate)
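    # ``data.get_transform`` is assumed to build a downsampling transform along
    # the lines of the original tutorial, i.e. roughly:
    #
    # new_sample_rate = 8000  # assumed target rate; see data.py for the actual value
    # transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)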
    ######################################################################
    # We are encoding each word using its index in the list of labels.
    #

    word_start = "yes"
    index = data.label_to_index(word_start, labels)
    word_recovered = data.index_to_label(index, labels)

    print(word_start, "-->", index, "-->", word_recovered)
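    # ``data.label_to_index`` and ``data.index_to_label`` are simple lookups in
    # the sorted label list; a minimal sketch of what they are assumed to do:
    #
    # def label_to_index(word, labels):
    #     # Return the position of the word in labels, as a tensor
    #     return torch.tensor(labels.index(word))
    #
    # def index_to_label(index, labels):
    #     # Return the word corresponding to the index in labels
    #     return labels[index]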
    ######################################################################
    # To turn a list of data points made of audio recordings and utterances
    # into two batched tensors for the model, we implement a collate function
    # which is used by the PyTorch DataLoader that allows us to iterate over a
    # dataset by batches. Please see `the
    # documentation <https://pytorch.org/docs/stable/data.html#working-with-collate-fn>`__
    # for more information about working with a collate function.
    #
    # In the collate function, we also apply the text encoding. The resampling
    # is applied later on the whole batch, directly on the device, in the
    # training and testing functions below.
    #
    def pad_sequence(batch):
        # Make all tensors in a batch the same length by padding with zeros
        batch = [item.t() for item in batch]
        batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
        return batch.permute(0, 2, 1)
    def collate_fn(batch):
        # A data tuple has the form:
        # waveform, sample_rate, label, speaker_id, utterance_number
        tensors, targets = [], []

        # Gather in lists, and encode labels as indices
        for waveform, _, label, *_ in batch:
            tensors += [waveform]
            targets += [data.label_to_index(label, labels)]

        # Group the list of tensors into a batched tensor
        tensors = pad_sequence(tensors)
        targets = torch.stack(targets)
        return tensors, targets
    batch_size = 256

    if device == "cuda":
        num_workers = 1
        pin_memory = True
    else:
        num_workers = 0
        pin_memory = False
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=collate_fn,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    # set up model
    audio_model = model.M5(n_input=transformed.shape[0], n_output=len(labels))
    audio_model.to(device)
    print(audio_model)

    def count_parameters(audio_model):
        return sum(p.numel() for p in audio_model.parameters() if p.requires_grad)

    n = count_parameters(audio_model)
    print("Number of parameters: %s" % n)
    ######################################################################
    # We will use the same optimization technique used in the paper, an Adam
    # optimizer with weight decay set to 0.0001. At first, we will train with
    # a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
    # to 0.001 during training after 20 epochs.
    #

    optimizer = optim.Adam(audio_model.parameters(), lr=0.01, weight_decay=0.0001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)  # reduce the learning rate after 20 epochs by a factor of 10
    ######################################################################
    # Training and Testing the Network
    # --------------------------------
    #
    # Now let’s define a training function that will feed our training data
    # into the model and perform the backward pass and optimization steps. For
    # training, the loss we will use is the negative log-likelihood. The
    # network will then be tested after each epoch to see how the accuracy
    # varies during the training.
    #
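    # Note: ``F.nll_loss`` expects log-probabilities, so ``model.M5`` is assumed
    # (as in the original tutorial) to apply ``log_softmax`` to its outputs.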
    def train(audio_model, epoch, log_interval):
        audio_model.train()
        for batch_idx, (data, target) in enumerate(train_loader):

            data = data.to(device)
            target = target.to(device)

            # apply transform and model on whole batch directly on device
            data = transform(data)
            output = audio_model(data)

            # negative log-likelihood for a tensor of size (batch x 1 x n_output)
            loss = F.nll_loss(output.squeeze(), target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # print training stats
            if batch_idx % log_interval == 0:
                print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")

            # update progress bar
            pbar.update(pbar_update)

            # record loss
            losses.append(loss.item())

        tensorboard_writer.add_scalar("train loss", losses[-1], epoch)

        # save model
        checkpoints_path = os.path.join(save_path, "checkpoints")
        if not os.path.exists(checkpoints_path):
            os.makedirs(checkpoints_path)
        model_save_path = os.path.join(checkpoints_path, "model_{}.pt".format(epoch))
        print("saving to", model_save_path, "...")
        torch.save({
            'epoch': epoch,
            'model_state_dict': audio_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, model_save_path)
        print("...saved")
    ######################################################################
    # Now that we have a training function, we need to make one for testing
    # the network's accuracy. We will set the model to ``eval()`` mode and then
    # run inference on the test dataset. Calling ``eval()`` sets the training
    # variable in all modules in the network to false. Certain layers like
    # batch normalization and dropout layers behave differently during
    # training so this step is crucial for getting correct results.
    #

    def number_of_correct(pred, target):
        # count number of correct predictions
        return pred.squeeze().eq(target).sum().item()
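    # ``model.get_likely_index`` is assumed (following the original tutorial) to
    # pick the most likely class for each element in the batch, e.g.:
    #
    # def get_likely_index(tensor):
    #     # find the index of the maximum log-probability
    #     return tensor.argmax(dim=-1)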
    def test(audio_model, epoch):
        audio_model.eval()
        correct = 0
        for data, target in test_loader:

            data = data.to(device)
            target = target.to(device)

            # apply transform and model on whole batch directly on device
            data = transform(data)
            output = audio_model(data)

            pred = model.get_likely_index(output)
            correct += number_of_correct(pred, target)

            # update progress bar
            pbar.update(pbar_update)

        print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")
        tensorboard_writer.add_scalar("test accuracy", correct / len(test_loader.dataset), epoch)
    # set up tensorboard
    tensorboard_writer = SummaryWriter(save_path)
    # tensorboard_writer.add_graph(model, transform(train_set[0][0]))
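    # The scalars logged by train() and test() can be inspected with TensorBoard,
    # e.g. by running ``tensorboard --logdir runs`` in a terminal and opening
    # http://localhost:6006 in a browser.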
    ######################################################################
    # Finally, we can train and test the network. We train for ``n_epoch``
    # epochs; the scheduler reduces the learning rate by a factor of 10 every
    # 20 epochs. The network is tested after each epoch to see how the
    # accuracy varies during the training.
    #
    log_interval = 20
    n_epoch = 50

    pbar_update = 1 / (len(train_loader) + len(test_loader))
    losses = []

    # The transform needs to live on the same device as the model and the data.
    transform = transform.to(device)
    with tqdm(total=n_epoch) as pbar:
        for epoch in range(1, n_epoch + 1):
            train(audio_model, epoch, log_interval)
            test(audio_model, epoch)
            scheduler.step()

    tensorboard_writer.close()
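    # A checkpoint saved by train() can later be restored to resume training or
    # to run inference; a sketch (the epoch number is just an example):
    #
    # checkpoint = torch.load(os.path.join(save_path, "checkpoints", "model_{}.pt".format(n_epoch)), map_location=device)
    # audio_model.load_state_dict(checkpoint["model_state_dict"])
    # optimizer.load_state_dict(checkpoint["optimizer_state_dict"])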
    # Let's plot the training loss versus the number of iterations.
    # plt.plot(losses);
    # plt.title("training loss");