# WaveRNN - Fit a 30min Sample

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
# Use the absolute mount path. The previous relative "drive/My Drive/..." only
# resolved from the initial working directory, so re-running this cell after
# the first chdir failed with FileNotFoundError.
os.chdir("/content/drive/My Drive/asmr-is-all-you-need/network")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import time, sys, math
import numpy as np
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm, notebook
from scipy.io import wavfile
from librosa import resample
from utils.display import *
from utils.dsp import *
from models.wavernn_extended import WaveRNN

In [0]:
# (wav path, ASMR label) pairs consumed by the training loop, in order.
# To train on a single class keep only the matching entry, e.g.
# files = [('../data/prova_silicon.wav', 'silicon')]
files = [
    ('../data/prova_glass.wav', 'glass'),
    ('../data/prova_silicon.wav', 'silicon'),
]

In [14]:
# Tag used to build the checkpoint and generated-wav paths under outputs/.
notebook_name = 'extended_v1'
# Target sampling rate (Hz): read_signal resamples to it and the generated wav is written with it.
sample_rate = 22050
# NOTE: despite the name, this is passed to train() as `num_steps`, i.e. the
# number of optimisation steps per file per outer-loop iteration.
epochs = 2
# Number of rows the signal is folded into by preprocess_signal; also the
# batch size given to train() and model.init_hidden().
batch_size = 128

# Requires a CUDA device; the constructor prints the trainable-parameter count.
model = WaveRNN().cuda()

Trainable Parameters: 3.093 million


In [0]:
def split_signal(x) :
    """Split signed 16-bit samples into coarse (high byte) and fine (low byte) classes.

    The input is shifted by 2**15 so that the int16 range [-32768, 32767]
    becomes [0, 65535]; `coarse` is that value floor-divided by 256 and `fine`
    is the remainder, so both lie in [0, 255] for in-range input.

    Args:
        x: a scalar or NumPy array of sample values.

    Returns:
        (coarse, fine) with the same shape as `x`.
    """
    # Widen narrow NumPy integers before adding the offset. Under NumPy 2
    # promotion rules `int16_array + 2**15` raises OverflowError because 32768
    # does not fit in int16; older NumPy silently promoted to int32, which is
    # the dtype reproduced here.
    if isinstance(x, (np.ndarray, np.generic)) and x.dtype.kind in 'iu' and x.dtype.itemsize < 4 :
        x = x.astype(np.int32)
    unsigned = x + 2**15
    coarse = unsigned // 256
    fine = unsigned % 256
    return coarse, fine

In [0]:
def combine_signal(coarse, fine) :
    """Rebuild signed sample values from their coarse and fine parts.

    Computes ``coarse * 256 + fine - 2**15``, i.e. the inverse of the
    shift-and-split performed by split_signal.
    """
    high_part = coarse * 256
    unsigned = high_part + fine
    return unsigned - 2**15

In [0]:
def read_signal(file):
    """Load a wav file as mono int16 samples at the notebook's `sample_rate`.

    Args:
        file: path (or file object) accepted by scipy.io.wavfile.read.

    Returns:
        1-D np.int16 array resampled to the module-level `sample_rate`.

    NOTE(review): the final int16 cast assumes 16-bit PCM input; a float wav
    (values in [-1, 1]) would be truncated to zeros - confirm the data format.
    """
    sr, sample = wavfile.read(file)
    if sample.ndim > 1:
        # Multi-channel: down-mix exactly as before (sum over channels / 2).
        sample = sample.sum(axis=1) / 2
    else:
        # Mono files come back 1-D, where `sum(axis=1)` raised an AxisError.
        # Convert to float so the resampler gets the same dtype as the
        # down-mixed path.
        sample = sample.astype(np.float64)
    sample = resample(sample, orig_sr = sr, target_sr = sample_rate)
    # Resampling can overshoot the original peaks; clip so the int16 cast
    # saturates instead of wrapping around.
    sample = np.clip(sample, -2**15, 2**15 - 1)
    sample = sample.astype(np.int16)
    return sample

In [0]:
def preprocess_signal(signal, batch_size=batch_size):
    """Quantise a signal and fold it into `batch_size` equal-length rows.

    The signal is split into coarse/fine classes with split_signal, trailing
    samples that do not fill a whole multiple of `batch_size` are dropped, and
    each class array is reshaped to (batch_size, -1).

    Note that the default for `batch_size` is captured from the module-level
    value when this cell is executed.

    Returns:
        (coarse_classes, fine_classes), both of shape (batch_size, n).
    """
    def _fold(classes):
        # Keep the longest prefix whose length is a multiple of batch_size.
        usable = len(classes) // batch_size * batch_size
        return np.reshape(classes[:usable], (batch_size, -1))

    coarse_part, fine_part = split_signal(signal)
    return _fold(coarse_part), _fold(fine_part)

In [0]:
# label -> index (indices start at 1) and the labels in registration order.
asmr2idx = {}
idx2asmr = []
def asmr_processor(asmr_type):
    """Return the integer index for an ASMR label, registering it on first use.

    Indices are handed out in order of first appearance starting at 1, so
    index 0 is never assigned. Because of that offset, the label for index
    ``k`` is stored at ``idx2asmr[k - 1]``.
    """
    try:
        return asmr2idx[asmr_type]
    except KeyError:
        idx2asmr.append(asmr_type)
        new_index = len(idx2asmr)
        asmr2idx[asmr_type] = new_index
        return new_index

### Training Loop

In [0]:
def train(model, optimizer, asmr_number, embeddings, num_steps, batch_size, lr=1e-3, seq_len=960) :
    """Run `num_steps` optimisation steps of teacher-forced next-sample prediction.

    Each step draws one random window of `seq_len` samples (the same offset for
    every row) and accumulates, over all timesteps, the cross-entropy of the
    predicted coarse and fine classes against the following sample. The summed
    loss is back-propagated once per step.

    NOTE: the training data is NOT a parameter. The function reads the
    notebook-level arrays `coarse_classes` and `fine_classes` (as produced by
    preprocess_signal), so those globals must be bound before calling.

    Args:
        model: called as ``model(embedding, x_input, hidden, current_coarse)``
            and expected to return ``(out_coarse, out_fine, hidden)``; must also
            provide ``init_hidden(batch_size)``.
        optimizer: optimizer to step; the learning rate of every param group is
            overwritten with `lr`. Only parameters registered with it are
            updated, which matters for `embeddings` as well.
        asmr_number: index tensor passed to `embeddings` once per step.
        embeddings: callable mapping `asmr_number` to the conditioning tensor.
        num_steps: number of optimisation steps to run.
        batch_size: forwarded to ``model.init_hidden``.
        lr: learning rate applied to all param groups.
        seq_len: number of timesteps per training window.

    Raises:
        ValueError: if a row holds fewer than ``seq_len + 2`` samples.
    """
    
    for p in optimizer.param_groups : p['lr'] = lr
    start = time.time()
    running_loss = 0
    
    for step in range(num_steps) :
        
        loss = 0
        hidden = model.init_hidden(batch_size)
        optimizer.zero_grad()
        
        # Exclusive upper bound for the window start; np.random.randint would
        # only report an opaque "low >= high" for a signal that is too short.
        max_start = coarse_classes.shape[1] - seq_len - 1
        if max_start <= 0 :
            raise ValueError(
                'signal too short: rows hold %i samples but seq_len=%i needs at least %i'
                % (coarse_classes.shape[1], seq_len, seq_len + 2))
        rand_idx = np.random.randint(0, max_start)
        
        # Inputs: classes 0..255 rescaled to [-1, 1].
        x_coarse = coarse_classes[:, rand_idx:rand_idx + seq_len]
        x_coarse = torch.FloatTensor(x_coarse)
        x_coarse = x_coarse / 127.5 - 1.
        x_fine = fine_classes[:, rand_idx:rand_idx + seq_len]
        x_fine = torch.FloatTensor(x_fine)
        x_fine = x_fine / 127.5 - 1.
        
        # Targets: the same window shifted one sample into the future.
        y_coarse = coarse_classes[:, rand_idx + 1:rand_idx + seq_len + 1]
        y_coarse = torch.LongTensor(y_coarse)
        y_fine = fine_classes[:, rand_idx + 1: rand_idx + seq_len + 1]
        y_fine = torch.LongTensor(y_fine)
        
        # Move the whole window to the GPU once per step instead of issuing
        # three host-to-device copies per timestep. Tensors are stored
        # time-major so that indexing timestep i yields contiguous slices.
        x_inputs = torch.stack([x_coarse, x_fine], dim=-1).transpose(0, 1).contiguous().cuda()
        y_coarse = y_coarse.t().contiguous().cuda()
        y_fine = y_fine.t().contiguous().cuda()

        embedding = embeddings(asmr_number)
        for i in range(seq_len) :
            
            # (rows, 2): column 0 is the coarse input, column 1 the fine input.
            x_input = x_inputs[i]
            
            c_target = y_coarse[i]
            f_target = y_fine[i]
            
            # The true coarse value of the target sample, rescaled to [-1, 1],
            # is fed to the model alongside the previous sample.
            current_coarse = c_target.float() / 127.5 - 1.
            current_coarse = current_coarse.unsqueeze(-1)
            
            out_coarse, out_fine, hidden = model(embedding, x_input, hidden, current_coarse)
            
            loss_coarse = F.cross_entropy(out_coarse, c_target)
            loss_fine = F.cross_entropy(out_fine, f_target)
            loss += (loss_coarse + loss_fine)
        
        # Track the per-timestep average for reporting only.
        running_loss += (loss.item() / seq_len)
        loss.backward()
        optimizer.step()
        
        elapsed = time_since(start)
        speed = (step + 1) / (time.time() - start)
        
        stream('Step: %i/%i --- Loss: %.3f --- %s --- @ %.1f batches/sec ',
              (step + 1, num_steps, running_loss / (step + 1), elapsed, speed))         

In [0]:
# Adam with its default hyperparameters over `model`'s parameters only;
# train() overwrites the learning rate of every param group on each call.
optimizer = optim.Adam(model.parameters())

In [0]:
# One 16-d vector per ASMR class. asmr_processor hands out indices starting at
# 1, so row 0 is reserved as the padding row and the table has len(files)+1 rows.
embeddings = nn.Embedding(num_embeddings=len(files)+1, embedding_dim=16, padding_idx=0).cuda()
# The optimizer was created from model.parameters() only, so without this the
# embedding weights are never updated (and their gradients never zeroed)
# during training.
optimizer.add_param_group({'params': list(embeddings.parameters())})

In [0]:
# Decode, resample and quantise every file once up front. Doing this inside
# the outer loop repeated identical preprocessing on each of its 100 iterations.
training_sets = []
for (file, asmr_type) in files:
    sample = read_signal(file)
    class_index = asmr_processor(asmr_type)
    coarse, fine = preprocess_signal(sample)
    # One class index per batch row, as a LongTensor on the GPU.
    labels = torch.ones([batch_size], dtype=torch.long).cuda() * class_index
    training_sets.append((coarse, fine, labels))

for _ in notebook.tqdm(range(100)):
    # train() reads the notebook-level coarse_classes / fine_classes, so the
    # loop targets below deliberately rebind those globals for each file.
    for coarse_classes, fine_classes, asmr_number in training_sets:
        train(model, optimizer, asmr_number, embeddings, num_steps=epochs, batch_size=batch_size, lr=1e-3)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Step: 2/2 --- Loss: 3.836 --- 0m 4s --- @ 0.5 batches/sec 

In [0]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the checkpoint.
os.makedirs(f'outputs/{notebook_name}', exist_ok=True)
# NOTE(review): only `model`'s weights are saved here; the separate
# `embeddings` table is not part of this checkpoint.
torch.save(model.state_dict(), f'outputs/{notebook_name}/model_{notebook_name}.pt')

In [23]:
# Display the embedding table (its only parameter); row 0 is the padding row.
embeddings.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-1.2488,  1.4049, -0.2708, -1.4467,  0.5609,  0.8664, -0.2327, -0.8576,
          0.1752,  0.4649,  1.0354,  0.2161, -0.8152, -0.4376,  0.5105, -0.0937],
        [-0.0528, -0.4788, -0.4407, -0.3140, -0.1513,  2.0285, -0.3473,  0.0342,
         -1.1886,  2.1028, -0.8277, -0.9387,  1.9280,  0.3267, -0.3563, -0.6319]],
       device='cuda:0', requires_grad=True)

In [24]:
# Detached per-class embedding vectors. The padding row (index 0) is skipped,
# so emb[k] holds the vector for class index k + 1.
emb = [row.detach() for row in embeddings.weight[1:]]
emb

[tensor([-1.2488,  1.4049, -0.2708, -1.4467,  0.5609,  0.8664, -0.2327, -0.8576,
          0.1752,  0.4649,  1.0354,  0.2161, -0.8152, -0.4376,  0.5105, -0.0937],
        device='cuda:0'),
 tensor([-0.0528, -0.4788, -0.4407, -0.3140, -0.1513,  2.0285, -0.3473,  0.0342,
         -1.1886,  2.1028, -0.8277, -0.9387,  1.9280,  0.3267, -0.3563, -0.6319],
        device='cuda:0')]

In [25]:
# Positions in `emb` to condition generation on (0 is the first registered class).
chosen = {0}
asmr_types = [emb[k] for k in chosen]
print(asmr_types)

# Number of audio samples to synthesise.
num_samples = 100_000
output, c, f = model.generate(num_samples, asmr_types)

[tensor([-1.2488,  1.4049, -0.2708, -1.4467,  0.5609,  0.8664, -0.2327, -0.8576,
         0.1752,  0.4649,  1.0354,  0.2161, -0.8152, -0.4376,  0.5105, -0.0937],
       device='cuda:0')]
Gen: 100000/100000 -- Speed: 651

In [0]:
def save_wav(y, filename, sample_rate) :
    """Write `y` to `filename` as a 16-bit PCM wav at `sample_rate` Hz.

    Values outside the int16 range are saturated to [-32768, 32767] before
    the cast to int16.
    """
    limits = np.iinfo(np.int16)
    saturated = np.clip(y, limits.min, limits.max)
    pcm = saturated.astype(np.int16)
    wavfile.write(filename, sample_rate, pcm)

In [0]:
# Destination of the generated audio; the directory outputs/<notebook_name>/
# must already exist when the file is written.
output_path = f'./outputs/{notebook_name}/gen_{notebook_name}1.wav'

In [0]:
# Write the generated samples as a 16-bit wav at the training sample rate.
save_wav(output, output_path, sample_rate)