In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch import autograd
from torch.nn import init
import math
from ops import mu_law_encode, time_to_batch, batch_to_time
from model2 import WaveNetModel
from audio_reader import AudioReader
import torch.optim as optim
import librosa
import time
import Queue
import json
FILE_PATTERN = r'([0-9]*)/audio([0-9]*)\.wav'

ImportError: cannot import name one_hot

In [8]:
with open('./wavenet_params.json', 'r') as config_file:
    wavenet_params = json.load(config_file)
data_directory = './training_set'
logdir_root = './logdir'
l2_reg = 0
silence_threshold = 0
epsilon = 10 ** np.random.uniform(0,4,20)
momentum = 0.9
checkpoint_every = 50
batch_size=wavenet_params["batch_size"]
validation_directory = './validation_set'

In [3]:
model = WaveNetModel(
                 wavenet_params["batch_size"],
                 wavenet_params["dilations"],
                 wavenet_params["filter_width"],
                 wavenet_params["residual_channels"],
                 wavenet_params["dilation_channels"],
                 wavenet_params["skip_channels"],
                 quantization_channels = wavenet_params["quantization_channels"],
                 use_biases = wavenet_params["use_biases"],
                 scalar_input = wavenet_params["scalar_input"],
                 use_cuda = wavenet_params["use_cuda"],
                 initial_filter_width = wavenet_params["initial_filter_width"]
                 )
optimizer = optim.SGD(model.parameters(), lr=epsilon, momentum = momentum, nesterov = True, weight_decay = l2_reg)
model.load_state_dict(torch.load('./epoch3680-loss=2.601_model.txt'))
for param in model.parameters():
    param.requires_grad = False
model.postprocess1 = nn.Conv1d(model.skip_channels, model.skip_channels, 1)
model.postprocess2 = nn.Conv1d(model.skip_channels, model.num_classes, 1)
if model.use_cuda:
    model = model.cuda()

In [4]:
reader = AudioReader(
        data_directory,
        sample_rate=wavenet_params["sample_rate"],
        receptive_field=model.receptive_field)   
start_time = time.time()
num_iters = reader.thread_main()
print (time.time() - start_time)
val_reader = AudioReader(
        validation_directory,
        sample_rate=wavenet_params["sample_rate"],
        receptive_field=model.receptive_field,
        load_size = 10)
start_time = time.time()
val_num_iters = val_reader.thread_main()
print (time.time() - start_time)
print (val_num_iters)

0.574819087982
1.21495509148
964


In [5]:
val_batch = torch.zeros((val_num_iters,model.receptive_field+1))
val_targets = torch.zeros(val_num_iters).type(torch.LongTensor)
for i in range(val_num_iters):
    val_batch[i,:] = torch.FloatTensor(val_reader.data_set.get_nowait())
    val_targets[i] = val_reader.target_queue.get_nowait().astype(np.int64)
if model.use_cuda:
    val_batch = autograd.Variable(val_batch).cuda()
    val_targets = autograd.Variable(val_targets).cuda()
else:
    val_batch = autograd.Variable(val_batch)
    val_targets = autograd.Variable(val_targets)

In [6]:
def set_model(wavenet_params, epsilon, l2_reg, File_name):
    model = WaveNetModel(
                 wavenet_params["batch_size"],
                 wavenet_params["dilations"],
                 wavenet_params["filter_width"],
                 wavenet_params["residual_channels"],
                 wavenet_params["dilation_channels"],
                 wavenet_params["skip_channels"],
                 quantization_channels = wavenet_params["quantization_channels"],
                 use_biases = wavenet_params["use_biases"],
                 scalar_input = wavenet_params["scalar_input"],
                 use_cuda = wavenet_params["use_cuda"],
                 initial_filter_width = wavenet_params["initial_filter_width"]
                 )
    optimizer = optim.SGD(model.parameters(), lr=epsilon, momentum = momentum, nesterov = True, weight_decay = l2_reg)
    model.load_state_dict(torch.load(File_name))
    for param in model.parameters():
        param.requires_grad = False
    model.postprocess1 = nn.Conv1d(model.skip_channels, model.skip_channels, 1)
    model.postprocess2 = nn.Conv1d(model.skip_channels, model.num_classes, 1)
    if model.use_cuda:
        model = model.cuda()
    return model, optimizer

In [9]:
epoch = 0
audio_batch = torch.zeros(batch_size,model.receptive_field+1)
targets_batch = torch.zeros(batch_size).type(torch.LongTensor)
accurancy_counter = torch.zeros(model.num_classes)

best_lr = 10
best_acc = 0

for j, lr in enumerate(epsilon):
    cur_time = time.time()
    model, optimizer = set_model(wavenet_params, lr, 0, './epoch220-loss=1.568_model.txt')
    for ep in range(200):
        num_iters = reader.thread_main()
        for iter in range(num_iters/batch_size):
            start_time = time.time()
            for i in range(batch_size):
                audio_part = reader.data_set.get_nowait()
                audio_batch[i,:] = torch.FloatTensor(audio_part)
                targets_batch[i] =  reader.target_queue.get_nowait().astype(np.int64)
            if model.use_cuda:
                audio = autograd.Variable(audio_batch.cuda())
                targets = autograd.Variable(targets_batch.cuda())
            else:
                audio = autograd.Variable(audio_batch)
                targets = autograd.Variable(targets_batch)
            model.zero_grad()
            raw_output = model(audio)
            loss = model._loss(raw_output, targets)
            loss.backward()
            optimizer.step()
    acc = model.accurancy(val_batch,val_targets)
    if acc > best_acc:
        best_acc = acc
        best_lr = lr
    ModelFile = './acc/accuracy:{:.5f} iteration:{:d} learning rate:{:.5f} loss:{:.3f}'.format(acc, j, lr, loss.data[0])
    print ('time:{:.1f}'.format(time.time() - cur_time ))
    print (ModelFile)
    torch.save(model.state_dict(), ModelFile)

time:171.8
./acc/accuracy:0.48859 iteration:0 learning rate:560.57378 loss:0.618
time:170.4
./acc/accuracy:0.49066 iteration:1 learning rate:23.57983 loss:0.771
time:170.1
./acc/accuracy:0.51349 iteration:2 learning rate:1.08125 loss:0.689
time:168.7
./acc/accuracy:0.50207 iteration:3 learning rate:2.23274 loss:1.237
time:169.4
./acc/accuracy:0.49378 iteration:4 learning rate:24.92113 loss:0.849
time:170.2
./acc/accuracy:0.50622 iteration:5 learning rate:4.91011 loss:0.758
time:171.0
./acc/accuracy:0.47925 iteration:6 learning rate:17.53136 loss:0.792
time:173.3
./acc/accuracy:0.49481 iteration:7 learning rate:439.88691 loss:0.844
time:176.6
./acc/accuracy:0.50104 iteration:8 learning rate:9.22684 loss:0.827
time:172.1
./acc/accuracy:0.50207 iteration:9 learning rate:746.76651 loss:1.201
time:173.7
./acc/accuracy:0.50207 iteration:10 learning rate:572.08271 loss:1.065


KeyboardInterrupt: 

In [None]:
epoch = 0
audio_batch = torch.zeros(batch_size,model.receptive_field+1)
targets_batch = torch.zeros(batch_size).type(torch.LongTensor)
accurancy_counter = torch.zeros(model.num_classes)

while 1:
    num_iters = reader.thread_main()
    for iter in range(num_iters/batch_size):
        start_time = time.time()
        for i in range(batch_size):
            audio_part = reader.data_set.get()
            audio_batch[i,:] = torch.FloatTensor(audio_part)
            targets_batch[i] =  reader.target_queue.get().astype(np.int64)
        if model.use_cuda:
            audio = autograd.Variable(audio_batch).cuda()
            targets = autograd.Variable(targets_batch).cuda()
        else:
            audio = autograd.Variable(audio_batch)
            targets = autograd.Variable(targets_batch) 
        model.zero_grad()

        raw_output = model(audio)
        loss = model._loss(raw_output, targets)
        loss.backward()
        optimizer.step()
        duration = time.time() - start_time
        if model.use_cuda:
            print('epoch {:d}, step {:d} - loss = {:.3f}, ({:.3f} sec/step)'
                  .format(epoch, iter, loss.cpu().data[0], duration))
            ModelFile = open('./logdir/step{:.3f}-loss={:.3f}_model.txt'.format(epoch, loss.data[0]), 'w')
    if (iter % checkpoint_every == 0):
        if model.use_cuda:
            print('epoch {:d}, step {:d} - loss = {:.3f}, ({:.3f} sec/step)'
                  .format(epoch, iter, loss.cpu().data[0], duration))
            ModelFile = open('./logdir/step{:.3f}-loss={:.3f}_model.txt'.format(epoch, loss.data[0]), 'w')
        else:
            print('epoch {:d}, step {:d} - loss = {:.3f}, ({:.3f} sec/step)'
                  .format(epoch, iter, loss.cpu().data[0], duration))
            ModelFile = open('./logdir/step{:.3f}-loss={:.3f}_model.txt'.format(epoch, loss.data[0]), 'w')                
        torch.save(model.state_dict(), ModelFile)
    epoch += 1

In [None]:
raw_text = open("./training_set/targets/1/audio200.txt").read()

In [None]:
print (reader.data_set.get().shape)

In [None]:
num_iters = reader.thread_main()

In [None]:
epoch = 0
audio_batch = torch.zeros(batch_size,model.receptive_field+1)
targets_batch = torch.zeros(batch_size).type(torch.LongTensor)
while 1:
    num_iters = reader.thread_main()
    for iter in range(num_iters/batch_size):
        start_time = time.time()
        for i in range(batch_size):
            print (reader.data_set.qsize())
            audio_part = reader.data_set.get()
            print (np.mean(audio_part))
            audio_batch[i,:] = torch.FloatTensor(reader.data_set.get())
            targets_batch[i] =  reader.target_queue.get().astype(np.int64)
            
            
            print (targets_batch[i])
        if model.use_cuda:
            audio = autograd.Variable(audio_batch).cuda()
            targets = autograd.Variable(targets_batch).cuda()
        else:
            audio = autograd.Variable(audio_batch)
            targets = autograd.Variable(targets_batch) 
        #model.zero_grad()
        encoded_input = mu_law_encode(audio,
                              model.quantization_channels)
        network_input = model.one_hot(encoded_input)

        # Cut off the last sample of network input to preserve causality.
        network_input_width = network_input.size()[2] - 1
        network_input = network_input[:, :, :network_input_width]
        
        
        raw_output = model._create_network(network_input)
        print (audio[2:17, :])
        # Cut off the samples corresponding to the receptive field
        # for the first predicted sample.
        prediction = torch.transpose(raw_output, 1, 2).view(-1, model.num_classes)
        print (prediction)
        prediction = F.softmax(prediction)
        
        qwer, tmp = torch.max(prediction, 0)
        duration = time.time() - start_time