In [72]:
# !pip install librosa # in colab, you’ll need to install this
import librosa
import numpy as np
import torchvision
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import torch.nn.functional as F
import pandas as pd
from random import sample
import matplotlib.pyplot as plt
import soundfile

In [2]:
# Pick the compute device once: CUDA when PyTorch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")

Using cpu device


Why use `torch.from_numpy` rather than `torch.tensor`: <br />
https://stackoverflow.com/questions/48482787/pytorch-memory-model-torch-from-numpy-vs-torch-tensor

In [300]:
class AudioNN(nn.Module):
    """Three fully connected layers: ``in_dim -> out_dim -> out_dim -> in_dim``.

    The configured activation is applied after every layer, including the
    last one.

    Args:
        parameters: dict read for the keys
            ``'in_dim'``      - width of the input (and of the output),
            ``'out_dim'``     - width of the two hidden layers,
            ``'type_init'``   - ``'xavier'`` re-initialises every weight
                                matrix with Xavier-uniform; any other value
                                keeps PyTorch's default initialisation,
            ``'activation'``  - ``'sigmoid'`` or ``'ReLU'``,
            ``'dropout'``     - stored on the module but not used by it.

    Raises:
        ValueError: if ``parameters['activation']`` is not a supported name.
            Previously an unknown name made ``forward`` skip every layer and
            hand the input back unchanged.
    """

    # Supported values of parameters['activation'] and the function each selects.
    _ACTIVATIONS = {'sigmoid': torch.sigmoid, 'ReLU': F.relu}

    def __init__(self, parameters):
        super(AudioNN, self).__init__()

        in_dim = parameters['in_dim']
        out_dim = parameters['out_dim']

        self.type_init = parameters['type_init']
        self.activation = parameters['activation']
        self.dropout = parameters['dropout']

        # Fail at construction time instead of silently building a no-op model.
        self._resolve_activation()

        self.l1 = nn.Linear(in_dim, out_dim)
        self.l2 = nn.Linear(out_dim, out_dim)
        self.l3 = nn.Linear(out_dim, in_dim)

        if self.type_init == 'xavier':
            for layer in self.children():
                nn.init.xavier_uniform_(layer.weight)

    def _resolve_activation(self):
        """Return the callable for ``self.activation`` or raise ``ValueError``."""
        try:
            return self._ACTIVATIONS[self.activation]
        except KeyError:
            raise ValueError(
                f"Unsupported activation {self.activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}") from None

    def forward(self, x):
        """Apply l1, l2 and l3 in registration order, each followed by the activation."""
        activate = self._resolve_activation()

        for layer in self.children():
            x = activate(layer(x))

        return x

def read_file(file_path):
    """Load an audio file and return its STFT in frame-major layout.

    The file is loaded with ``sr=None`` and transformed with a 1024-point
    STFT using a hop of 512 samples. Both returned tensors are the transpose
    of what ``librosa.stft`` produces.

    Args:
        file_path: path handed to ``librosa.load``.

    Returns:
        Tuple ``(stft, magnitude, sample_rate)``: the transposed complex STFT,
        its element-wise absolute value (also transposed), and the sample
        rate reported by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    spectrum = librosa.stft(waveform, n_fft=1024, hop_length=512)

    complex_frames = torch.from_numpy(spectrum.T)
    magnitude_frames = torch.from_numpy(np.abs(spectrum).T)

    return complex_frames, magnitude_frames, sample_rate
    
class PrepareData(Dataset):
    """Row-wise dataset pairing ``X[i]`` with ``y[i]``.

    Args:
        X: inputs, indexed along the first axis.
        y: targets, indexed along the first axis.
        options: when equal to ``'conv1d'`` a singleton axis is inserted at
            position 1 of ``X``; any other value leaves ``X`` untouched.
            ``y`` is never reshaped.
    """

    def __init__(self, X, y, options):
        super(PrepareData, self).__init__()
        add_channel_axis = options == 'conv1d'
        self.X = torch.unsqueeze(X, dim=1) if add_channel_axis else X
        self.y = y

    def __len__(self):
        """Number of rows in the (possibly reshaped) inputs."""
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target
    
    
class Prepare2DImageData(Dataset):
    """Dataset of sliding windows over consecutive rows of ``X``.

    Window ``i`` stacks rows ``X[i : i + x_dim]`` into a tensor of shape
    ``(1, x_dim, n_features)``. Windows that run past the last row are padded
    at the bottom with ``PAD_VALUE``. There is one window per row of ``X``.

    The target returned for window ``i`` is the single row ``y[i]`` (the row
    the window starts on), or ``0`` when the dataset was built without labels.

    Args:
        X: 2-D tensor, one row per frame.
        y: targets indexed like ``X``, or ``None``.
        x_dim: number of rows per window.
    """

    # Fill value for rows missing at the end of the sequence.
    # NOTE(review): 1e5 is far larger than typical inputs - confirm that
    # 1e-5 was not intended. Kept unchanged to preserve existing behaviour.
    PAD_VALUE = 1e5

    def __init__(self, X, y, x_dim=20):
        super(Prepare2DImageData, self).__init__()
        # Only the inputs are windowed here: __getitem__ serves the raw target
        # row y[idx], so building label windows would be wasted work.
        self.X, _ = self.reshape_data(X, None, x_dim)
        self.y = y
        self.x_dim = x_dim

    def _window(self, rows, start, x_dim):
        """Return ``rows[start : start + x_dim]`` padded to ``x_dim`` rows, with a leading axis."""
        window = rows[start: start + x_dim]
        missing = x_dim - window.shape[0]

        if missing > 0:
            # The padding width comes from the tensor being padded, so labels
            # whose width differs from the inputs are handled correctly.
            padding = torch.full((missing, window.shape[1]), self.PAD_VALUE,
                                 device=window.device)
            window = torch.cat((window, padding))

        return torch.unsqueeze(window, dim=0)

    def reshape_data(self, X, y=None, x_dim=20):
        """Build one padded window per row of ``X`` (and of ``y`` when given).

        Returns:
            Tuple ``(x_windows, y_windows)``. ``x_windows`` has shape
            ``(len(X), 1, x_dim, X.shape[1])``. ``y_windows`` is the matching
            stack built from ``y``, or an empty list when ``y`` is ``None``.
        """
        x_windows = []
        y_windows = []

        for start in range(X.shape[0]):
            x_windows.append(self._window(X, start, x_dim))

            if y is not None:
                y_windows.append(self._window(y, start, x_dim))

        x_windows = torch.stack(x_windows)

        if y is not None:
            y_windows = torch.stack(y_windows)

        return x_windows, y_windows

    def __len__(self):
        """Number of windows (equal to the number of rows of the original ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(window, target)``; the target is ``0`` when no labels were given."""
        y = 0
        if self.y is not None:
            y = self.y[idx]

        return self.X[idx], y
    
def create_train_val(train_data, label_data, 
                     percentage=0.2, batchsize=64,
                     x_dim=20, options=None
                    ):
    """Randomly split the rows into validation/training sets and wrap them in DataLoaders.

    Row indices are shuffled with ``np.random.shuffle``; the first
    ``int(n_rows * percentage)`` go to validation and the rest to training.

    For ``options == 'conv2d'`` the sliding windows are built once over the
    rows in their ORIGINAL order and the split selects whole windows. Earlier
    code windowed the already-shuffled rows, so each window stacked unrelated
    frames. Because neighbouring windows share rows, a training window and a
    validation window may overlap in the rows they cover.

    Args:
        train_data: inputs, one row per sample.
        label_data: targets aligned with ``train_data``.
        percentage: fraction of rows assigned to validation.
        batchsize: batch size for both loaders.
        x_dim: window height, used only for ``'conv2d'``.
        options: ``'conv2d'`` selects ``Prepare2DImageData``; any other value
            is passed through to ``PrepareData``.

    Returns:
        Tuple ``(train_dataloader, val_dataloader)``. Neither loader
        reshuffles between epochs.
    """
    n_rows = train_data.shape[0]
    index = np.arange(n_rows)
    split = int(n_rows * percentage)

    np.random.shuffle(index)
    val_index, train_index = index[:split], index[split:]

    if options == 'conv2d':
        # Window first, split second, so every window holds consecutive frames.
        windows = Prepare2DImageData(train_data, label_data, x_dim=x_dim)
        val_dataset = torch.utils.data.Subset(windows, val_index.tolist())
        train_dataset = torch.utils.data.Subset(windows, train_index.tolist())
    else:
        val_dataset = PrepareData(train_data[val_index],
                                  label_data[val_index],
                                  options
                                 )
        train_dataset = PrepareData(train_data[train_index],
                                    label_data[train_index],
                                    options
                                   )

    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=False,
                                batch_size=batchsize)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=False,
                                  batch_size=batchsize)

    return train_dataloader, val_dataloader

def loss_snr(dataloader, loss_fn, model):
    """Evaluate ``model`` on ``dataloader`` and report mean loss and SNR.

    The model is switched to eval mode (and left there) and gradients are
    disabled for the whole pass.

    Returns:
        Tuple ``(mean_loss, snr)``. ``mean_loss`` is the sum of per-batch
        ``loss_fn`` values divided by the number of batches. ``snr`` is
        ``10 * log10(sum(y**2) / sum((y - pred)**2))`` accumulated over every
        batch.
    """
    model.eval()

    signal_energy = torch.tensor(0.)
    error_energy = torch.tensor(0.)
    loss_sum = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            signal_energy = signal_energy + (targets * targets).sum()

            outputs = model(inputs).to(device)
            loss_sum = loss_sum + loss_fn(outputs, targets).item()

            error_energy = error_energy + torch.square(targets - outputs).sum()

    mean_loss = loss_sum / len(dataloader)

    # No epsilon is added to the ratio: adding 10**20 overflowed when unpacking.
    snr = (10 * torch.log10((signal_energy / error_energy))).item()

    return mean_loss, snr
    
def train(model, dataloader, val_dataloader, 
          loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and then score both loaders.

    Every batch is moved to ``device``, pushed through the model, and used for
    one optimizer step. Afterwards ``loss_snr`` is evaluated on the training
    loader and then on the validation loader.

    Returns:
        ``[train_loss, train_snr, val_loss, val_snr]``.
    """
    model.train()

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and prediction error
        batch_loss = loss_fn(model(inputs), targets)

        # Backpropagation
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    train_metrics = loss_snr(dataloader, loss_fn, model)
    val_metrics = loss_snr(val_dataloader, loss_fn, model)

    return [train_metrics[0], train_metrics[1], val_metrics[0], val_metrics[1]]

def reconstruct(X, X_abs, S_abs, sr, filename):
    """Combine the phase of ``X`` with the magnitudes ``S_abs`` and write the audio to disk.

    The unit-modulus phase ``X / X_abs`` is multiplied by ``S_abs``, inverted
    with ``librosa.istft`` (hop length 512) and written to ``filename``.

    Bins where ``X_abs`` is zero used to produce ``0 / 0 = NaN``, which then
    spread through the inverse STFT. Those bins now get a phase of 1 (purely
    real), so the result stays finite.

    Args:
        X: complex STFT tensor supplying the phase.
        X_abs: magnitude of ``X``, same shape.
        S_abs: magnitude tensor to impose, same shape.
        sr: sample rate passed to ``soundfile.write``.
        filename: output path.
    """
    spectrum = X.numpy()
    magnitude = X_abs.numpy()

    # Start from phase 1 everywhere and overwrite only where division is defined.
    phase = np.ones_like(spectrum)
    np.divide(spectrum, magnitude, out=phase, where=magnitude > 0)

    S_recons = phase * S_abs.numpy()
    S_recons_sound = librosa.istft(S_recons, 
                                   hop_length=512, )
    soundfile.write(filename, S_recons_sound, sr)
           
def process(parameters, option = None):
    """Train one model variant, save its metrics and weights, and write test predictions.

    Steps:
      1. Read the training and label audio files and take their STFT magnitudes.
      2. Build train/validation loaders and the model selected by ``option``.
      3. Train with Adam and ``nn.MSELoss`` for ``parameters['epoch']`` epochs,
         printing the loss and SNR after every epoch.
      4. Pickle the per-epoch metrics and save the model ``state_dict`` to
         ``./hw_2_{option}.model``.
      5. For every path in ``parameters['test']``, predict magnitudes, combine
         them with the input phase and write a ``*_pred.wav`` file to the
         working directory.

    Args:
        parameters: configuration dict; keys read here are ``'train_file'``,
            ``'label_file'``, ``'batchsize'``, ``'lr_rate'``, ``'epoch'``,
            ``'test'`` and, for ``'conv2d'``, the nested ``'conv2d'`` dict.
        option: ``None`` (``AudioNN``), ``'conv1d'`` (``Audio1DCNN``) or
            ``'conv2d'`` (``Audio2DCNN``).

    Raises:
        ValueError: for any other ``option`` (previously this surfaced later
            as an unbound-variable error).
    """
    _, X_abs, _ = read_file(parameters['train_file'])
    _, S_abs, _ = read_file(parameters['label_file'])

    if option is None:
        train_dataloader, val_dataloader = create_train_val(
            X_abs, S_abs, 0.2, parameters['batchsize'])

        model = AudioNN(parameters)
        loss_filename = 'hw2_qns1_loss.pkl.gz'

    elif option == 'conv1d':
        train_dataloader, val_dataloader = create_train_val(
            X_abs, S_abs, 0.2, parameters['batchsize'],
            x_dim=20, options='conv1d')

        model = Audio1DCNN(parameters)
        loss_filename = 'hw2_qns2_loss.pkl.gz'

    elif option == 'conv2d':
        train_dataloader, val_dataloader = create_train_val(
            X_abs, S_abs, 0.2, parameters['batchsize'],
            x_dim=20, options='conv2d')

        model = Audio2DCNN(parameters['conv2d'])
        loss_filename = 'hw2_qns3_loss.pkl.gz'

        print(model)

    else:
        raise ValueError(
            f"Unknown option {option!r}; expected None, 'conv1d' or 'conv2d'")

    # Batches are moved to `device` during training, so every model variant
    # must live there too (the conv models used to stay on the CPU).
    model = model.to(device)
    model.train()
    loss_fn = nn.MSELoss()
    optimizer_adam = torch.optim.Adam(model.parameters(), 
                                      lr=parameters['lr_rate'])

    # Each entry: train_loss, train_snr, val_loss, val_snr
    all_loss = []
    for t in range(parameters['epoch']):
        loss = train(model, train_dataloader, 
                     val_dataloader, loss_fn, 
                     optimizer_adam)

        all_loss.append(loss)

        print('Epoch :', t)
        print('Train loss : ', loss[0])
        print('Train snr : ', loss[1])
        print('Val loss : ', loss[2])
        print('Val snr : ', loss[3])

    (pd.DataFrame(data=all_loss, 
                  columns=['train_loss', 'train_snr', 'val_loss', 'val_snr'])
     .to_pickle(loss_filename)
    )

    torch.save(model.state_dict(), f'./hw_2_{option}.model')

    # Inference only from here on: eval mode, no autograd bookkeeping.
    model.eval()

    # The un-optioned run writes "<stem>_pred.wav", the name the playback
    # cells load; the other runs keep "<stem>_<option>__pred.wav".
    tag = '' if option is None else f'_{option}_'

    for test in parameters['test']:
        T, T_abs, sr_t = read_file(test)

        with torch.no_grad():
            if option == 'conv2d':
                T_re = Prepare2DImageData(
                    T_abs, None, x_dim=20)

                T_dataloader = DataLoader(dataset=T_re, 
                                          shuffle=False, 
                                          batch_size=parameters['batchsize'])

                pred_t = torch.cat(
                    [model(x.to(device)) for x, _ in T_dataloader])
            elif option == 'conv1d':
                T_unsq = torch.unsqueeze(T_abs, dim=1)
                pred_t = model(T_unsq.to(device))
            else:
                pred_t = model(T_abs.to(device))

        # Move back to the CPU because reconstruct() converts to NumPy.
        pred_t_abs = torch.abs(pred_t).cpu()

        out_filename = (test.split('/')[-1]).split('.')[0] + tag + '_pred.wav'

        reconstruct(T.T, T_abs.T, pred_t_abs.T, 
                    sr_t, out_filename)



class Audio1DCNN(nn.Module):
    """1-D CNN: conv1 -> max1 -> conv2 -> max2 -> flatten -> linear.

    The configured activation follows each layer that has weights (the two
    convolutions and the linear layer). With ``'ReLU'`` this gives the same
    result as applying it after every layer, because max-pooling and
    flattening keep non-negative values non-negative.

    Args:
        parameters: dict read for the keys ``'conv1d_in_dim'`` (flattened
            width fed to the linear layer), ``'conv1d_out_dim'``,
            ``'in_channel'``, ``'out_channel'``, ``'kernel_size'``,
            ``'2nd_kernel_size'``, ``'stride'`` (shared by both convolutions),
            ``'pool_kernel_size'``, ``'pool_stride'``, ``'type_init'``
            (``'xavier'`` re-initialises conv/linear weights),
            ``'activation'`` (``'sigmoid'`` or ``'ReLU'``) and ``'dropout'``
            (stored, not used).

    Raises:
        ValueError: if ``parameters['activation']`` is not a supported name.
    """

    # Supported values of parameters['activation'] and the function each selects.
    _ACTIVATIONS = {'sigmoid': torch.sigmoid, 'ReLU': F.relu}

    # Layer types that carry weights: Xavier-initialised and followed by the activation.
    _WEIGHTED = (nn.Conv1d, nn.Linear)

    def __init__(self, parameters):
        super(Audio1DCNN, self).__init__()

        in_dim = parameters['conv1d_in_dim']
        out_dim = parameters['conv1d_out_dim']

        self.type_init = parameters['type_init']
        self.activation = parameters['activation']
        self.dropout = parameters['dropout']

        # Fail at construction time instead of silently building a no-op model.
        self._resolve_activation()

        self.conv1 = nn.Conv1d(in_channels=parameters['in_channel'],
                               out_channels=parameters['out_channel'],
                               kernel_size=parameters['kernel_size'],
                               stride=parameters['stride'],
                               )
        self.max1 = nn.MaxPool1d(kernel_size=parameters['pool_kernel_size'],
                                 stride=parameters['pool_stride']
                                )

        # conv2 consumes conv1's output, so its input channel count must be
        # conv1's out_channel (it used to be in_channel, which only worked
        # when the two happened to be equal).
        self.conv2 = nn.Conv1d(in_channels=parameters['out_channel'],
                               out_channels=parameters['out_channel'],
                               kernel_size=parameters['2nd_kernel_size'],
                               stride=parameters['stride'])
        self.max2 = nn.MaxPool1d(kernel_size=parameters['pool_kernel_size'],
                                 stride=parameters['pool_stride'])
        self.flatten = nn.Flatten()
        self.l3 = nn.Linear(in_dim, out_dim)

        if self.type_init == 'xavier':
            for layer in self.children():
                if isinstance(layer, self._WEIGHTED):
                    nn.init.xavier_uniform_(layer.weight)

    def _resolve_activation(self):
        """Return the callable for ``self.activation`` or raise ``ValueError``."""
        try:
            return self._ACTIVATIONS[self.activation]
        except KeyError:
            raise ValueError(
                f"Unsupported activation {self.activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}") from None

    def forward(self, x):
        """Run every child layer in registration order; activate after conv/linear layers."""
        activate = self._resolve_activation()

        for layer in self.children():
            x = layer(x)
            if isinstance(layer, self._WEIGHTED):
                x = activate(x)

        return x

    
    

class Audio2DCNN(nn.Module):
    """2-D CNN: conv1 -> max1 -> conv2 -> flatten -> linear (no second pooling stage).

    The configured activation follows each layer that has weights (the two
    convolutions and the linear layer). With ``'ReLU'`` this gives the same
    result as applying it after every layer, because max-pooling and
    flattening keep non-negative values non-negative.

    Args:
        parameters: dict read for the keys ``'in_dim'`` (flattened width fed
            to the linear layer), ``'out_dim'``, ``'in_channel'``,
            ``'out_channel'``, ``'kernel_size'``, ``'stride'`` (shared by both
            convolutions), ``'pool_kernel_size'``, ``'pool_stride'``,
            ``'2nd_in_channel'``, ``'2nd_out_channel'``, ``'2nd_kernel_size'``,
            ``'type_init'`` (``'xavier'`` re-initialises conv/linear weights),
            ``'activation'`` (``'sigmoid'`` or ``'ReLU'``) and ``'dropout'``
            (stored, not used).

    Raises:
        ValueError: if ``parameters['activation']`` is not a supported name.
    """

    # Supported values of parameters['activation'] and the function each selects.
    _ACTIVATIONS = {'sigmoid': torch.sigmoid, 'ReLU': F.relu}

    # Layer types that carry weights: Xavier-initialised and followed by the activation.
    _WEIGHTED = (nn.Conv2d, nn.Linear)

    def __init__(self, parameters):
        super(Audio2DCNN, self).__init__()

        in_dim = parameters['in_dim']
        out_dim = parameters['out_dim']

        self.type_init = parameters['type_init']
        self.activation = parameters['activation']
        self.dropout = parameters['dropout']

        # Fail at construction time instead of silently building a no-op model.
        self._resolve_activation()

        self.conv1 = nn.Conv2d(in_channels=parameters['in_channel'],
                               out_channels=parameters['out_channel'],
                               kernel_size=parameters['kernel_size'],
                               stride=parameters['stride'],
                               )
        self.max1 = nn.MaxPool2d(kernel_size=parameters['pool_kernel_size'],
                                 stride=parameters['pool_stride']
                                )

        # conv2 receives the 4-D feature map from max1, so it must be a 2-D
        # convolution (it was declared as nn.Conv1d). The weight shape is the
        # same, so previously saved state_dicts remain loadable.
        self.conv2 = nn.Conv2d(in_channels=parameters['2nd_in_channel'],
                               out_channels=parameters['2nd_out_channel'],
                               kernel_size=parameters['2nd_kernel_size'],
                               stride=parameters['stride'])
        self.flatten = nn.Flatten()
        self.l3 = nn.Linear(in_dim, out_dim)

        if self.type_init == 'xavier':
            for layer in self.children():
                if isinstance(layer, self._WEIGHTED):
                    nn.init.xavier_uniform_(layer.weight)

    def _resolve_activation(self):
        """Return the callable for ``self.activation`` or raise ``ValueError``."""
        try:
            return self._ACTIVATIONS[self.activation]
        except KeyError:
            raise ValueError(
                f"Unsupported activation {self.activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}") from None

    def forward(self, x):
        """Run every child layer in registration order; activate after conv/linear layers."""
        activate = self._resolve_activation()

        for layer in self.children():
            x = layer(x)
            if isinstance(layer, self._WEIGHTED):
                x = activate(x)

        return x
           
       
           
        
# Configuration shared by every model variant. The nested "conv2d" dict is
# passed on its own to Audio2DCNN; all other keys are read from the top level.
parameters = {
    # --- data ---------------------------------------------------------------
    "train_file": "./data/train_dirty_male.wav",
    "label_file": "./data/train_clean_male.wav",
    "test": ["./data/test_x_01.wav", "./data/test_x_02.wav"],

    # --- optimisation ---------------------------------------------------------
    "batchsize": 64,
    "lr_rate": 0.01,
    "epoch": 300,

    # --- settings read by both AudioNN and Audio1DCNN -------------------------
    "activation": "ReLU",
    "type_init": "xavier",
    "dropout": None,

    # --- fully connected model (AudioNN) ------------------------------------
    "in_dim": 513,
    "out_dim": 1024,

    # --- 1-D CNN (Audio1DCNN) -----------------------------------------------
    # NOTE(review): Audio1DCNN also reads a top-level "2nd_kernel_size", which
    # is not defined here, so building that model from this dict raises
    # KeyError. Confirm the value used for the recorded run and add it.
    "conv1d_in_dim": 118,
    "conv1d_out_dim": 513,
    "in_channel": 1,
    "out_channel": 1,
    "kernel_size": 10,
    "stride": 2,
    "pool_kernel_size": 5,
    "pool_stride": 1,

    # Not read by any code visible in this notebook.
    "2d_stride": 2,

    # --- 2-D CNN (Audio2DCNN) -----------------------------------------------
    "conv2d": {
        "activation": "ReLU",
        "type_init": "xavier",
        "dropout": None,
        "in_channel": 1,
        "out_channel": 3,
        "kernel_size": (2, 2),
        "stride": 3,
        "pool_kernel_size": (2, 2),
        "pool_stride": 3,
        "2nd_in_channel": 3,
        "2nd_out_channel": 5,
        "2nd_kernel_size": (2, 2),
        "in_dim": 95,
        "out_dim": 513,
    },
}
    
    
# process(parameters)
# process(parameters, 'conv1d')
# process(parameters, 'conv2d')

### SNR scores

In [302]:
import pandas as pd

# One (banner, metrics file) pair per question. Each file is the pickled
# DataFrame of per-epoch metrics; the five epochs with the highest
# validation SNR are printed under their banner.
loss_logs = [
    ('\n\n---- Qns 1 -------------------- \n', 'hw2_qns1_loss.pkl.gz'),
    ('\n\n---- Qns 2 ------------------ \n', 'hw2_qns2_loss.pkl.gz'),
    ('\n\n---- Qns 3 ---------------- \n', 'hw2_qns3_loss.pkl.gz'),
]

for banner, log_path in loss_logs:
    print(banner)

    df = pd.read_pickle(log_path)
    df = df.sort_values(by=['val_snr'], ascending =False)
    print(df.head())



---- Qns 1 -------------------- 

     train_loss  train_snr  val_loss   val_snr
79     0.017225   7.550718  0.024621  6.216075
87     0.017206   7.552752  0.024682  6.210452
47     0.017607   7.453322  0.024803  6.169569
54     0.017424   7.498651  0.024947  6.151829
172    0.015087   8.126129  0.025013  6.144553


---- Qns 2 ------------------ 

    train_loss  train_snr  val_loss   val_snr
18    0.035278   4.629365  0.035165  3.862931
22    0.034530   4.723810  0.035302  3.846718
20    0.034951   4.669848  0.035331  3.843074
17    0.035689   4.577298  0.035363  3.836814
26    0.034135   4.775268  0.035395  3.836028


---- Qns 3 ---------------- 

     train_loss  train_snr  val_loss   val_snr
2      0.071011   1.121661  0.104187  0.834148
236    0.067931   1.313225  0.106844  0.714055
231    0.068001   1.308765  0.106848  0.713908
235    0.067946   1.312300  0.106852  0.713704
238    0.067908   1.314710  0.106853  0.713675


# Results:

In [277]:
import IPython

In [303]:
# Qns 1: play the predicted audio for test clip 01
# (the wav file is read relative to the working directory).

IPython.display.Audio("test_x_01_pred.wav")

In [304]:
# Qns 1: play the predicted audio for test clip 02.
IPython.display.Audio("test_x_02_pred.wav")

In [305]:
# Qns 2: play the 'conv1d' prediction for test clip 01
# (the wav file is read relative to the working directory).

IPython.display.Audio("test_x_01_conv1d__pred.wav")

In [306]:
# Qns 2: play the 'conv1d' prediction for test clip 02.
IPython.display.Audio("test_x_02_conv1d__pred.wav")

In [307]:
# Qns 3: play the 'conv2d' prediction for test clip 01
# (the wav file is read relative to the working directory).

IPython.display.Audio("test_x_01_conv2d__pred.wav")

In [308]:
# Qns 3: play the 'conv2d' prediction for test clip 02.
IPython.display.Audio("test_x_02_conv2d__pred.wav")