# Import Libraries

In [3]:
import torch, os
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torchvision.utils import save_image
from torchvision.utils import make_grid
from torch.autograd import Variable
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from torch.utils import data as torchData

import sys
from os import listdir
from os.path import isfile, join
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
from PIL import Image
import librosa
import librosa.display
import IPython.display as ipd

import random

%matplotlib inline

print(sys.version)
print('pytorch : ', torch.__version__)
print('librosa : ', librosa.__version__)
print('numpy : ', np.__version__)
print('scipy : ', sp.__version__)
print('matplotlib : ', mpl.__version__)

3.6.1 |Anaconda custom (64-bit)| (default, May 11 2017, 13:25:24) [MSC v.1900 64 bit (AMD64)]
pytorch :  0.4.0
librosa :  0.5.1
numpy :  1.12.1
scipy :  1.0.1
matplotlib :  2.0.2


In [4]:
'''
Data Format : Time / Material / Action / Reaction

* Time : Hit time. ex) 2.677438

* Material
wood, metal, dirt, rock, leaf, plastic, cloth, paper, water, grass(top 10)
drywall, plastic bag, gravel, ceramic, glass, carpet, tile, other

* Action
hit, scratch

* Reaction
static, scatter, rigid-motion, deform, splash
'''

# Condsider top 10 materials / hit, scartch / static, scatter



'\nData Format : Time / Material / Action / Reaction\n\n* Time : Hit time. ex) 2.677438\n\n* Material\nwood, metal, dirt, rock, leaf, plastic, cloth, paper, water, grass(top 10)\ndrywall, plastic bag, gravel, ceramic, glass, carpet, tile, other\n\n* Action\nhit, scratch\n\n* Reaction\nstatic, scatter, rigid-motion, deform, splash\n'

# Data Split

In [5]:
path_dir = "./data/"
output_dir = "./data_split/"

file_list = os.listdir(path_dir) 

wav_files = [f for f in file_list if f[-12:] == "denoised.wav"]
txt_files = [f for f in file_list if f[-4:] == ".txt"]

wav_files.sort()
txt_files.sort()

print("wav files")
print(wav_files)

print("txt files")
print(txt_files)

data_list_1 = []
data_list_2 = []
data_list_3 = []

for path in txt_files:
    print(path)
    f = open(path_dir+path,'r')
    data =f.read()
    
    data_list = data.split("\n")
    data_list = data_list [:-1]
    data_list = [data for data in data_list if data.find("None",9) == -1]
    
    print(data_list)
    
    temp_hit = [float(data.split(" ")[0]) for data in data_list]
    data_list_1.append(temp_hit)
    data_list_2.append([data.split(" ")[1] for data in data_list])
    data_list_3.append([data.split(" ")[2] for data in data_list])
    
    f.close()
    
#for 1 wav file
for idx1 in range(len(data_list_1)):
    
    y, sr = librosa.load(path_dir+wav_files[idx1],sr=16000)
    
    for idx2 in range(len(data_list_1[idx1])):
        # 시간 계산
        curr_time = data_list_1[idx1][idx2]
        output_path = output_dir + txt_files[idx1][:-9]+data_list_2[idx1][idx2]+"_"+data_list_3[idx1][idx2]+"_"+str(idx1)+"_"+str(idx2)+".wav"
        print(output_path)
        librosa.output.write_wav(output_path, y[int(sr*curr_time):int(sr*curr_time)+16384],sr)


wav files
['2015-02-16-16-49-06_denoised.wav', '2015-02-16-16-56-35_denoised.wav']
txt files
['2015-02-16-16-49-06_times.txt', '2015-02-16-16-56-35_times.txt']
2015-02-16-16-49-06_times.txt
['2.677438 ceramic hit static', '3.675198 ceramic hit static', '4.713354 ceramic hit static', '6.554396 ceramic hit static', '7.496979 ceramic hit rigid-motion', '8.427156 ceramic hit static', '9.382239 ceramic hit static', '10.410521 metal hit rigid-motion', '12.281136 ceramic hit rigid-motion', '13.104990 water hit splash', '14.151709 ceramic hit rigid-motion', '20.941792 ceramic hit static', '21.905563 ceramic hit static', '23.108355 ceramic scratch static', '24.642521 ceramic scratch static', '28.452520 metal hit static', '29.441355 metal hit static', '31.693365 metal scratch static', '41.459679 metal hit static', '44.055698 ceramic hit static', '44.897221 ceramic hit rigid-motion', '45.738251 ceramic scratch static', '47.434917 ceramic hit static', '48.176094 metal hit rigid-motion', '49.101509

# Data Load

In [6]:
path_dir = "./data_split/"

In [7]:

class AudioLoader(torchData.Dataset):
    
    def __init__(self, inPath, isShuffle = False):
        
        files = os.listdir(inPath)
        
        files = [f for f in files if f[-4:] == ".wav"]
        
#        files = [f for f in files if (f.find("wood",18) != -1 or f.find("metal",18) != -1)]
        files = [f for f in files if (f.find("ceramic",18) != -1 or f.find("metal",18) != -1)]
        
        if isShuffle:
            random.shuffle(files)
        
        self.inPath = inPath
        self.isShuffle = isShuffle
        self.len = len(files)  
        self.files = files
        print("# of wav files : ", self.len)
    
    def __getitem__(self, idx):
        
        y, sr = librosa.load(self.inPath+self.files[idx], 16000)
        y = torch.from_numpy(y)

        return y
    
    def __len__(self):
        return self.len
    
    
    
    

# Hparams Setting

In [8]:
lr = 0.0001
max_epoch = 20
batch_size = 64
z_dim = 100
image_size = 64
g_conv_dim = 64
d_conv_dim = 64
log_step = 100
sample_step = 500
sample_num = 32

# Generator

In [9]:
# Conv2d (Batch_num , Channel, length) 
def deconv(c_in, c_out, k_size, stride=2, pad=1, bn=True):
    """Custom deconvolutional layer for simplicity."""
    layers = []
    layers.append(nn.ConvTranspose2d(c_in, c_out, k_size, stride, pad))
    if bn:
        layers.append(nn.BatchNorm2d(c_out))
    return nn.Sequential(*layers)

#deconv1d
def deconv1d(c_in, c_out, k_size, stride=4, pad=11, out_pad = 1,bn=True):
    """Custom convolutional 1d lyaer for simplicity."""
    layers = []
    layers.append(nn.ConvTranspose1d(c_in, c_out, k_size, stride, pad,out_pad))
    if bn:
        layers.append(nn.BatchNorm1d(c_out))
    return nn.Sequential(*layers)

class WaveGAN_Generator(nn.Module):
    """Generator containing 7 deconvolutional layers."""
    def __init__(self, z_dim=100, image_size=128, conv_dim=g_conv_dim):
        super(WaveGAN_Generator, self).__init__()
        self.fc = nn.Linear(z_dim, 256*conv_dim)
#        self.deconv1 = deconv1d(conv_dim*16, conv_dim*8, (25,2,2), pad = 11)
        self.deconv1 = deconv1d(conv_dim*16, conv_dim*8, k_size = 25, pad = 11, out_pad = 1)
        self.deconv2 = deconv1d(conv_dim*8, conv_dim*4, 25)
        self.deconv3 = deconv1d(conv_dim*4, conv_dim*2, 25)
        self.deconv4 = deconv1d(conv_dim*2, conv_dim, 25)
        self.deconv5 = deconv1d(conv_dim, 1, 25, bn=False)
        
    def forward(self, z):
#        z = z.view(z.size(0), z.size(1))      # If image_size is 64, output shape is as below.
        out = self.fc(z)                 # (?, 256d)
        print(out.size())
        out = out.view(out.size(0),16*g_conv_dim,16 ) # (?,16,16d)
        print(out.size())
        out = F.relu(out)
        out = F.relu(self.deconv1(out))  # (?, 64, 8d)
        print("a")
        print(out.size())
        
        out = F.relu(self.deconv2(out))  # (?, 256, 4d)
        
        print(out.size())
        out = F.relu(self.deconv3(out))  # (?, 1024, 2d)
        print(out.size())
        out = F.relu(self.deconv4(out))  # (?, 4096, d)
        print(out.size())
        out = F.tanh(self.deconv5(out))  # (?, 16384, c)
        print(out.size())
        return out
    
G = WaveGAN_Generator(z_dim,image_size,g_conv_dim)
G

WaveGAN_Generator(
  (fc): Linear(in_features=100, out_features=16384, bias=True)
  (deconv1): Sequential(
    (0): ConvTranspose1d(1024, 512, kernel_size=(25,), stride=(4,), padding=(11,), output_padding=(1,))
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (deconv2): Sequential(
    (0): ConvTranspose1d(512, 256, kernel_size=(25,), stride=(4,), padding=(11,), output_padding=(1,))
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (deconv3): Sequential(
    (0): ConvTranspose1d(256, 128, kernel_size=(25,), stride=(4,), padding=(11,), output_padding=(1,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (deconv4): Sequential(
    (0): ConvTranspose1d(128, 64, kernel_size=(25,), stride=(4,), padding=(11,), output_padding=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (deconv5): Sequential(
    (0

In [10]:
n_data = torch.ones(5,100)
#n_data

In [18]:
G = WaveGAN_Generator()

output_ = G.forward(n_data)
output_.size()[0]
output_.view(output_.size()[0],1,output_.size()[2])

torch.Size([5, 16384])
torch.Size([5, 1024, 16])
a
torch.Size([5, 512, 64])
torch.Size([5, 256, 256])
torch.Size([5, 128, 1024])
torch.Size([5, 64, 4096])
torch.Size([5, 1, 16384])


tensor([[[ 5.9021e-03, -5.9917e-02, -3.3245e-02,  ...,  9.6087e-03,
          -4.9979e-02, -2.4678e-02]],

        [[ 5.9021e-03, -5.9917e-02, -3.3245e-02,  ...,  9.6087e-03,
          -4.9979e-02, -2.4678e-02]],

        [[ 5.9021e-03, -5.9917e-02, -3.3245e-02,  ...,  9.6087e-03,
          -4.9979e-02, -2.4678e-02]],

        [[ 5.9021e-03, -5.9917e-02, -3.3245e-02,  ...,  9.6087e-03,
          -4.9979e-02, -2.4678e-02]],

        [[ 5.9021e-03, -5.9917e-02, -3.3245e-02,  ...,  9.6087e-03,
          -4.9979e-02, -2.4678e-02]]])

# Discriminator

In [12]:
def conv(c_in, c_out, k_size, stride=4, pad=1,bn=True):
    """Custom convolutional 1d lyaer for simplicity."""
    layers = []
    layers.append(nn.Conv2d(c_in, c_out, k_size,stride,pad))
    if bn:
        layers.append(nn.BatchNorm2d(c_out))
    return nn.Sequential(*layers)


def conv1d(c_in, c_out, k_size, stride=4, pad=11, out_pad = 1,bn=True):
    """Custom convolutional 1d lyaer for simplicity."""
    layers = []
    layers.append(nn.Conv1d(c_in, c_out, k_size,stride,pad))
    if bn:
        layers.append(nn.BatchNorm1d(c_out))
    return nn.Sequential(*layers)

def apply_phaseshuffle(x, n_phase, pad_type = 'refelct'):
    (batch, n_channel, x_len) = x.shape
    r = random.randrange(-n_phase, n_phase+1)
    pad_l = np.maximum( r , 0)
    pad_r = np.maximum(-r, 0)
    phase_start = pad_r
    
    padding = nn.ReflectionPad2d((pad_l, pad_r, 0, 0))
#    print("phase : ", r)
    #print("pad_l, pad_r, phase_start, x_len", pad_l, pad_r, phase_start, x_len)
    #print("x.shape", x.shape)
    
    for x_ in x:

        ch_, len_ = x_.shape
        x_ = x_.reshape(1,1,ch_,len_)
        x_ = padding(x_)
        x_ = x_[:, :, :,phase_start:phase_start + len_]
        x_ = x_.reshape(ch_,len_)
    
    return x
    

#Ref DCGAN : https://github.com/InsuJeon/Hello-Generative-Model/blob/master/Day04/DCGAN/dcgan.ipynb
class WaveGAN_Discriminator(nn.Module):
    """Discriminator containing 4 convolutional layers."""
    

    def __init__(self,  conv_dim=d_conv_dim, n_phase = 2):
        
        if n_phase > 0:
            self.phaseshuffle = lambda x: apply_phaseshuffle(x,n_phase)
        else:
            self.phaseshuffle = lambda x: x
        
        super(WaveGAN_Discriminator, self).__init__()
        self.conv1 = conv1d(1, conv_dim, 25, bn=False)
        self.conv2 = conv1d(conv_dim, conv_dim*2, 25)
        self.conv3 = conv1d(conv_dim*2, conv_dim*4, 25)
        self.conv4 = conv1d(conv_dim*4, conv_dim*8, 25)
        self.conv5 = conv1d(conv_dim*8, conv_dim*16, 25)
        self.fc = nn.Linear(conv_dim*16*16,1)
        
#            conv(conv_dim*8, 1, int(image_size/16), 1, 0, False)
        
    def forward(self, x):
        # (?, 1, 16384) -> (?, 64, 4096)
        out = F.leaky_relu(self.conv1(x), 0.2)    # (?, 64, 32, 32)
        out = self.phaseshuffle(out)
        
        # (?, 64, 4096) -> (?, 128, 1024)
        out = F.leaky_relu(self.conv2(out), 0.2)  # (?, 128, 16, 16)
        out = self.phaseshuffle(out)

        # (?, 128, 1024) -> (?, 256, 256)
        out = F.leaky_relu(self.conv3(out), 0.2)  # (?, 256, 8, 8)
        out = self.phaseshuffle(out)

        # (?, 256, 256) -> (?, 512, 64)
        out = F.leaky_relu(self.conv4(out), 0.2)  # (?, 512, 4, 4)
        out = self.phaseshuffle(out)

        # (?, 512, 64) -> (?, 1024, 16)
        out = F.leaky_relu(self.conv5(out), 0.2)  # (?, 512, 4, 4)
        out = self.phaseshuffle(out)

        # (?, 1024, 16) -> (?, 16384)
        # (?,16384) -> (?,1)
        out = out.view(out.size(0), 256 * d_conv_dim)
        out = F.sigmoid(self.fc(out))
        out = out.squeeze()
        
        return out

In [13]:
D = WaveGAN_Discriminator(n_phase = 2)

In [16]:
print(output.size)
D.forward(output)

<built-in method size of Tensor object at 0x000001FCF4280C60>


tensor([ 0.4828,  0.4828,  0.4828,  0.4828,  0.4828])

In [103]:
print("path_dir : ", path_dir)
dataset = AudioLoader(path_dir)

trainLoader = torchData.DataLoader(
    dataset = dataset,
    batch_size = 3,
    shuffle = False
)

criterion_vanillia = nn.BCELoss()
d_optimizer = torch.optim.Adam(D.parameters(), lr=lr, betas=(0.5,0.9))
g_optimizer = torch.optim.Adam(G.parameters(), lr=lr, betas=(0.5,0.9))


for epoch in range(10):
    
    for idx, data in enumerate(trainLoader):
        #print(data)

path_dir :  ./data_split/
# of wav files :  38
tensor([[-0.0582,  0.0627,  0.0343,  ..., -0.0002, -0.0002, -0.0002],
        [ 0.0146,  0.0841, -0.2565,  ...,  0.0249,  0.0085, -0.0068],
        [-0.1053,  0.3532, -0.4445,  ..., -0.0004, -0.0003, -0.0002]])
tensor([[-0.2800,  0.4602, -0.0682,  ..., -0.7203, -0.2984,  0.1931],
        [ 0.0158, -0.2845,  0.6770,  ...,  0.0211,  0.0029,  0.0449],
        [-0.2772, -0.1096,  0.1559,  ...,  0.0000, -0.0000,  0.0000]])
tensor([[ 0.0702, -0.5358,  0.3916,  ..., -0.0011, -0.0004,  0.0015],
        [ 0.0638, -0.0955, -0.0287,  ...,  0.0033,  0.0052, -0.0081],
        [-0.1652, -0.0455,  0.5964,  ..., -0.0002, -0.0003, -0.0001]])
tensor([[ 0.0049, -0.0666,  0.0408,  ..., -0.0014,  0.0000,  0.0007],
        [-0.1692,  0.1627,  0.1360,  ...,  0.1122,  0.0911, -0.0940],
        [-0.0904, -0.0285,  0.2533,  ...,  0.0005,  0.0026, -0.0013]])
tensor([[ 0.0717, -0.5103,  0.3428,  ...,  0.0003, -0.0015, -0.0028],
        [-0.1422,  0.0434,  0.7737,  ..

        [-0.1053,  0.3532, -0.4445,  ..., -0.0004, -0.0003, -0.0002]])
tensor([[-0.2800,  0.4602, -0.0682,  ..., -0.7203, -0.2984,  0.1931],
        [ 0.0158, -0.2845,  0.6770,  ...,  0.0211,  0.0029,  0.0449],
        [-0.2772, -0.1096,  0.1559,  ...,  0.0000, -0.0000,  0.0000]])
tensor([[ 0.0702, -0.5358,  0.3916,  ..., -0.0011, -0.0004,  0.0015],
        [ 0.0638, -0.0955, -0.0287,  ...,  0.0033,  0.0052, -0.0081],
        [-0.1652, -0.0455,  0.5964,  ..., -0.0002, -0.0003, -0.0001]])
tensor([[ 0.0049, -0.0666,  0.0408,  ..., -0.0014,  0.0000,  0.0007],
        [-0.1692,  0.1627,  0.1360,  ...,  0.1122,  0.0911, -0.0940],
        [-0.0904, -0.0285,  0.2533,  ...,  0.0005,  0.0026, -0.0013]])
tensor([[ 0.0717, -0.5103,  0.3428,  ...,  0.0003, -0.0015, -0.0028],
        [-0.1422,  0.0434,  0.7737,  ...,  0.0006,  0.0002,  0.0007],
        [-0.1921,  0.3932, -0.3010,  ...,  0.0074,  0.0038, -0.0114]])
tensor([[-0.3336,  0.1196, -0.2098,  ...,  0.0000,  0.0000,  0.0000],
        [-0.124

        [-0.2772, -0.1096,  0.1559,  ...,  0.0000, -0.0000,  0.0000]])
tensor([[ 0.0702, -0.5358,  0.3916,  ..., -0.0011, -0.0004,  0.0015],
        [ 0.0638, -0.0955, -0.0287,  ...,  0.0033,  0.0052, -0.0081],
        [-0.1652, -0.0455,  0.5964,  ..., -0.0002, -0.0003, -0.0001]])
tensor([[ 0.0049, -0.0666,  0.0408,  ..., -0.0014,  0.0000,  0.0007],
        [-0.1692,  0.1627,  0.1360,  ...,  0.1122,  0.0911, -0.0940],
        [-0.0904, -0.0285,  0.2533,  ...,  0.0005,  0.0026, -0.0013]])
tensor([[ 0.0717, -0.5103,  0.3428,  ...,  0.0003, -0.0015, -0.0028],
        [-0.1422,  0.0434,  0.7737,  ...,  0.0006,  0.0002,  0.0007],
        [-0.1921,  0.3932, -0.3010,  ...,  0.0074,  0.0038, -0.0114]])
tensor([[-0.3336,  0.1196, -0.2098,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1248,  0.1481, -0.0112,  ..., -0.0000, -0.0002, -0.0001],
        [-0.0023, -0.0301,  0.0789,  ..., -0.0045,  0.0023,  0.0035]])
tensor([[-0.0582,  0.0627,  0.0343,  ..., -0.0002, -0.0002, -0.0002],
        [-0.216

        [-0.1652, -0.0455,  0.5964,  ..., -0.0002, -0.0003, -0.0001]])
tensor([[ 0.0049, -0.0666,  0.0408,  ..., -0.0014,  0.0000,  0.0007],
        [-0.1692,  0.1627,  0.1360,  ...,  0.1122,  0.0911, -0.0940],
        [-0.0904, -0.0285,  0.2533,  ...,  0.0005,  0.0026, -0.0013]])
tensor([[ 0.0717, -0.5103,  0.3428,  ...,  0.0003, -0.0015, -0.0028],
        [-0.1422,  0.0434,  0.7737,  ...,  0.0006,  0.0002,  0.0007],
        [-0.1921,  0.3932, -0.3010,  ...,  0.0074,  0.0038, -0.0114]])
tensor([[-0.3336,  0.1196, -0.2098,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1248,  0.1481, -0.0112,  ..., -0.0000, -0.0002, -0.0001],
        [-0.0023, -0.0301,  0.0789,  ..., -0.0045,  0.0023,  0.0035]])
tensor([[-0.0582,  0.0627,  0.0343,  ..., -0.0002, -0.0002, -0.0002],
        [-0.2166,  0.2974, -0.1138,  ...,  0.0098, -0.0104, -0.0006],
        [-0.2386,  0.1556, -0.1297,  ...,  0.0182,  0.1056, -0.0516]])
tensor([[ 0.0459,  0.0499, -0.1558,  ..., -0.0026, -0.0008,  0.0000],
        [-0.340