In [1]:
import torch
from wavenet_model import *
from audio_data import WavenetDataset
from wavenet_training import *
from model_logging import *

# modified wavenet file implementation from: https://github.com/Vichoko/pytorch-wavenet/tree/master

# Tensor type constructors handed to the model and trainer. They default to
# the CPU variants and are swapped for the CUDA variants when a GPU is present.
dtype = torch.FloatTensor  # data type
ltype = torch.LongTensor  # label type

use_cuda = torch.cuda.is_available()
if use_cuda:
    print('use gpu')
    dtype = torch.cuda.FloatTensor
    ltype = torch.cuda.LongTensor

# Build the WaveNet model (layers=10 per block, blocks=3).
model = WaveNetModel(layers=10,
                     blocks=3,
                     dilation_channels=32,
                     residual_channels=32,
                     skip_channels=1024,
                     end_channels=512,
                     output_length=16,
                     dtype=dtype,
                     bias=True)
# Only move the model to the GPU when one exists: an unconditional
# `model.cuda()` raises on CPU-only machines even though `use_cuda` was
# computed above precisely to handle that case.
if use_cuda:
    model.cuda()
print('model: ', model)
print('receptive field: ', model.receptive_field)
print('parameter count: ', model.parameter_count())

# Each dataset item is sized from the model itself:
# receptive_field + output_length - 1 input samples, with output_length targets.
data = WavenetDataset(dataset_file='./example.npz',
                      item_length=model.receptive_field + model.output_length - 1,
                      target_length=model.output_length,
                      file_location='./unpacked_data',
                      test_stride=500)
print('the dataset has ' + str(len(data)) + ' items')


def generate_and_log_samples(step):
    """Generate audio clips from the latest snapshot at two temperatures.

    Loads the most recent model from the ``snapshots`` directory (with
    ``use_cuda=False``), generates 32000 samples at temperature 0.5 and then
    at temperature 1.0, and converts each clip to a float32 TensorFlow tensor.

    The ``logger.audio_summary`` calls that would consume those tensors are
    currently disabled, so the tensors are discarded and ``step`` is unused.

    Args:
        step: Training step the clips would be logged under (currently unused).
    """
    num_samples = 32000
    latest_model = load_latest_model_from('snapshots', use_cuda=False)
    print("start generating...")
    for temperature in (0.5, 1.):
        clip = generate_audio(latest_model,
                              length=num_samples,
                              temperatures=[temperature])
        clip_tensor = tf.convert_to_tensor(clip, dtype=tf.float32)
        # Logging is disabled; the original calls were
        # logger.audio_summary('temperature_0.5', ..., step, sr=16000) and
        # logger.audio_summary('temperature_1.0', ..., step, sr=16000).
    print("audio clips generated")


# Configure the trainer for the model and dataset built above. The snapshot_*
# arguments are passed straight through to WavenetTrainer
# (presumably a checkpoint is saved every `snapshot_interval` steps — confirm
# against wavenet_training).
trainer = WavenetTrainer(model=model,
                         dataset=data,
                         lr=0.001,
                         snapshot_path='snapshots',
                         snapshot_name='birdset_model',
                         snapshot_interval=1000,
                         dtype=dtype,
                         ltype=ltype)

# Leftover debugging removed from this cell: an unused
# `start_data = data[250000][0]` lookup (it would raise on a dataset with
# fewer items before training even started) and a stray `display(data)`.
print('start training...')
trainer.train(batch_size=16,
              epochs=12)

use gpu
model:  WaveNetModel(
  (filter_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (gate_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (residual_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(1,), stride=(1,))
  )
  (skip_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 1024, kernel_size=(1,), stride=(1,))
  )
  (start_conv): Conv1d(256, 32, kernel_size=(1,), stride=(1,))
  (end_conv_1): Conv1d(1024, 512, kernel_size=(1,), stride=(1,))
  (end_conv_2): Conv1d(512, 256, kernel_size=(1,), stride=(1,))
)
receptive field:  3070
parameter count:  1834592
one hot input
the dataset has 255490 items


  self.data = Variable(dtype(num_channels, max_length).zero_())


start training...
epoch 0
loss at step 50: 3.7523823404312133
one training step does take approximately 0.25975210666656495 seconds)
loss at step 100: 3.3060987424850463
loss at step 150: 3.11121160030365
loss at step 200: 3.0084674072265627


100%|██████████| 32/32 [00:03<00:00, 10.61it/s]


validation loss: 2.7252301275730133
validation accuracy: 20.751953125%
loss at step 250: 3.0322255754470824
loss at step 300: 2.982674684524536
loss at step 350: 2.926995987892151
loss at step 400: 2.9541908359527587


100%|██████████| 32/32 [00:03<00:00, 10.46it/s]


validation loss: 2.4927340783178806
validation accuracy: 26.806640625%
loss at step 450: 2.8708009099960328
loss at step 500: 2.725304355621338
loss at step 550: 2.744828405380249
loss at step 600: 2.698204131126404


100%|██████████| 32/32 [00:02<00:00, 10.72it/s]


validation loss: 2.28435605019331
validation accuracy: 28.06396484375%
loss at step 650: 2.5216562509536744
loss at step 700: 2.5633893966674806
loss at step 750: 2.5318618059158324
loss at step 800: 2.411579170227051


100%|██████████| 32/32 [00:02<00:00, 10.75it/s]


validation loss: 2.1338761411607265
validation accuracy: 30.76171875%
loss at step 850: 2.455847804546356
loss at step 900: 2.420767467021942
loss at step 950: 2.339075019359589
loss at step 1000: 2.392481310367584


100%|██████████| 32/32 [00:02<00:00, 10.83it/s]


validation loss: 2.075526785105467
validation accuracy: 31.23779296875%
loss at step 1050: 2.401376276016235
loss at step 1100: 2.376470251083374
loss at step 1150: 2.4124598360061644
loss at step 1200: 2.3052714133262633


100%|██████████| 32/32 [00:02<00:00, 10.87it/s]


validation loss: 2.026743870228529
validation accuracy: 31.43310546875%
loss at step 1250: 2.356872398853302
loss at step 1300: 2.3398267674446105
loss at step 1350: 2.3631361627578737
loss at step 1400: 2.381525673866272


100%|██████████| 32/32 [00:02<00:00, 10.87it/s]


validation loss: 2.0052985846996307
validation accuracy: 31.7138671875%
loss at step 1450: 2.325929641723633
loss at step 1500: 2.3262641048431396
loss at step 1550: 2.3162837624549866
loss at step 1600: 2.357054069042206


100%|██████████| 32/32 [00:02<00:00, 10.86it/s]


validation loss: 1.9523143209517002
validation accuracy: 33.70361328125%
loss at step 1650: 2.3001373171806336
loss at step 1700: 2.278665375709534
loss at step 1750: 2.289828083515167
loss at step 1800: 2.2546091938018797


100%|██████████| 32/32 [00:02<00:00, 10.87it/s]


validation loss: 1.9249052926898003
validation accuracy: 34.38720703125%
loss at step 1850: 2.278935272693634
loss at step 1900: 2.3040567779541017
loss at step 1950: 2.2493331909179686
loss at step 2000: 2.2397066235542296


100%|██████████| 32/32 [00:02<00:00, 10.87it/s]


validation loss: 1.9045193456113338
validation accuracy: 34.86328125%
loss at step 2050: 2.260926830768585
loss at step 2100: 2.333896429538727
loss at step 2150: 2.2659362840652464
loss at step 2200: 2.299089729785919


100%|██████████| 32/32 [00:03<00:00, 10.66it/s]


validation loss: 1.9239685386419296
validation accuracy: 33.31298828125%
loss at step 2250: 2.264460108280182
loss at step 2300: 2.228082776069641
loss at step 2350: 2.2385959792137147
loss at step 2400: 2.1913040041923524


100%|██████████| 32/32 [00:02<00:00, 10.92it/s]


validation loss: 1.9190063066780567
validation accuracy: 33.49609375%
loss at step 2450: 2.2558091926574706
loss at step 2500: 2.1886008048057555
loss at step 2550: 2.204746332168579
loss at step 2600: 2.295921447277069


100%|██████████| 32/32 [00:02<00:00, 11.02it/s]


validation loss: 1.8833548575639725
validation accuracy: 34.53369140625%
loss at step 2650: 2.230359013080597
loss at step 2700: 2.256378755569458
loss at step 2750: 2.1840563154220582
loss at step 2800: 2.216467945575714


100%|██████████| 32/32 [00:02<00:00, 10.89it/s]


validation loss: 1.8637284226715565
validation accuracy: 34.75341796875%
loss at step 2850: 2.2385149598121643
loss at step 2900: 2.2489780449867247
loss at step 2950: 2.2974090433120726
loss at step 3000: 2.24168616771698


100%|██████████| 32/32 [00:02<00:00, 10.70it/s]


validation loss: 1.8640907481312752
validation accuracy: 34.3994140625%
loss at step 3050: 2.172415881156921
loss at step 3100: 2.202288272380829


KeyboardInterrupt: 

In [7]:
import torch
from wavenet_model import *
from audio_data import WavenetDataset
from wavenet_training import *
from model_logging import *
# Tensor type constructors; CPU by default, CUDA when a GPU is available.
dtype = torch.FloatTensor  # data type
ltype = torch.LongTensor  # label type

use_cuda = torch.cuda.is_available()
if use_cuda:
    print('use gpu')
    dtype = torch.cuda.FloatTensor
    ltype = torch.cuda.LongTensor
# Pick the device from availability instead of hard-coding "cuda", so this
# cell also runs on CPU-only machines.
device = torch.device("cuda" if use_cuda else "cpu")

# Rebuild the architecture with the same hyperparameters used for training so
# the saved state dict matches the module layout.
model = WaveNetModel(layers=10,
                     blocks=3,
                     dilation_channels=32,
                     residual_channels=32,
                     skip_channels=1024,
                     end_channels=512,
                     output_length=16,
                     dtype=dtype,
                     bias=True)
# map_location remaps tensors saved from a GPU onto the selected device;
# without it a CUDA-saved checkpoint cannot be loaded on a CPU-only machine.
# NOTE(review): weights_only=False unpickles arbitrary Python objects and must
# only be used on checkpoints from a trusted source. If this file holds a plain
# state dict, weights_only=True should be sufficient — confirm and switch.
state_dict = torch.load('birdset_modelwavenet_model.pth',
                        map_location=device,
                        weights_only=False)
model.load_state_dict(state_dict)
model.eval()
model.to(device)

# Dataset items are sized from the model, as in the training cell.
data = WavenetDataset(dataset_file='./example.npz',
                      item_length=model.receptive_field + model.output_length - 1,
                      target_length=model.output_length,
                      file_location='./unpacked_data',
                      test_stride=500)
print('the dataset has ' + str(len(data)) + ' items')



# Seed the generator with the input part of one dataset item.
# NOTE(review): index 250000 is hard-coded and requires a dataset with more
# than 250000 items.
start_data = data[250000][0]
# torch.max(..., 0)[1] takes the argmax over dim 0 (presumably turning the
# one-hot encoded input into class indices — confirm against audio_data).
# The seed is moved to whichever device the model lives on rather than a
# hard-coded 'cuda', so generation also works without a GPU.
start_data = torch.max(start_data, 0)[1].to(next(model.parameters()).device)


def prog_callback(step, total_steps):
    """Print generation progress as a whole-number percentage.

    Args:
        step: Number of samples generated so far.
        total_steps: Total number of samples to generate.
    """
    print(str(100 * step // total_steps) + "% generated")


# Generate 32000 samples, reporting progress every 1000 steps.
generated = model.generate_fast(num_samples=32000,
                                 first_samples=start_data,
                                 progress_callback=prog_callback,
                                 progress_interval=1000,
                                 temperature=1.0,
                                 regularize=0.)

use gpu
one hot input
the dataset has 255490 items
torch.Size([1, 256, 1])
0% generated
2% generated
5% generated
8% generated
one generating step does take approximately 0.011234934329986573 seconds)
11% generated
14% generated
17% generated
19% generated
22% generated
25% generated
28% generated
31% generated
34% generated
37% generated
39% generated
42% generated
45% generated
48% generated
51% generated
54% generated
57% generated
59% generated
62% generated
65% generated
68% generated
71% generated
74% generated
76% generated
79% generated
82% generated
85% generated
88% generated
91% generated
94% generated
96% generated
99% generated


In [8]:
import IPython.display as ipd
import soundfile as sf

# Sample rate used for both playback and the exported WAV file.
SAMPLE_RATE = 16000

print(generated.shape)
# A bare `ipd.Audio(...)` expression only renders when it is the last line of
# the cell; here it was followed by `sf.write`, so the player never appeared.
# Display it explicitly instead.
ipd.display(ipd.Audio(generated, rate=SAMPLE_RATE))

# Persist the generated clip to disk.
sf.write('output_file.wav', generated, SAMPLE_RATE)

(32000,)
