In [51]:
import torchaudio
import torch

In [52]:
# load file at data/vcc2016_training/SF1/100001.wav
waveform, sample_rate = torchaudio.load('../data/vcc2016_training/SF1/100001.wav')
waveform.size()

torch.Size([1, 56314])

# TODO:
- [x] Just try to implement the architecture of CycleGAN Generator and Discriminator
- [x] Write model with Gx->y, Gy->x and Dy submodules
- [] Reproduce inverse-forward and forward-inverse passes
- Write the shell of training loop on random data
- Create variables for outputs and implement loss function
- Add backprop step
- Zero gradients
- Add optimizer
- Once that works, train the network with SGD
  - This might require actually implementing the dataset

In [150]:
class Downsample(torch.nn.Module):
   
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=(0, 0)):
      super(Downsample, self).__init__()
      self.in_channels = in_channels
      self.out_channels = out_channels
      self.conv = torch.nn.Conv2d(in_channels=in_channels,
                                  out_channels=out_channels,
                                  kernel_size=kernel_size,
                                  stride=stride,
                                  padding=padding)
      self.norm = torch.nn.InstanceNorm2d(num_features=out_channels, affine=True)
      self.glu = torch.nn.GLU(dim=1)
    
    def forward(self, x):
      # from IPython.core.debugger import set_trace;set_trace()
      x = self.conv(x)
      x = self.norm(x)
      x = self.glu(x)
      return x
  
class ResidualBlock(torch.nn.Module):
  def __init__(self, in_channels1, out_channels1, in_channels2, out_channels2, kernel_size, stride):
    super(ResidualBlock, self).__init__()
    self.conv1 = torch.nn.Conv2d(in_channels=in_channels1,
                                out_channels=out_channels1,
                                kernel_size=kernel_size,
                                stride=stride,
                                padding='same')
    self.norm1 = torch.nn.InstanceNorm2d(num_features=out_channels1, affine=True)
    self.glu = torch.nn.GLU(dim=1)
    self.conv2 = torch.nn.Conv2d(in_channels=in_channels2,
                                out_channels=out_channels2,
                                kernel_size=kernel_size,
                                stride=stride,
                                padding='same')
    self.norm2 = torch.nn.InstanceNorm2d(num_features=out_channels2, affine=True)
  
  def forward(self, x):
    residual = x.clone()
    x = self.conv1(x)
    x = self.norm1(x)
    x = self.glu(x)
    x = self.conv2(x)
    x = self.norm2(x)
    return x + residual

class Upsample(torch.nn.Module):
    
    def __init__(self, in_channels, out_channels, kernel_size, stride):
      super(Upsample, self).__init__()
      self.conv = torch.nn.Conv2d(in_channels=in_channels,
                                          out_channels=out_channels,
                                          kernel_size=kernel_size,
                                          stride=stride)
      self.pixel_shuffle = torch.nn.PixelShuffle(upscale_factor=2)
      self.norm = torch.nn.InstanceNorm2d(num_features=out_channels//4, affine=True)
      self.glu = torch.nn.GLU(dim=1)
    
    def forward(self, x):
      x = self.conv(x)
      x = self.pixel_shuffle(x)
      x = self.norm(x)
      x = self.glu(x)
      return x

class Generator(torch.nn.Module):
    
    def __init__(self):
      super(Generator, self).__init__()
      self.conv1 = torch.nn.Conv2d(in_channels=24, out_channels=128, kernel_size=(1, 5), stride=(1, 2))
      self.glu = torch.nn.GLU(dim=1)
      self.downsample_twice = torch.nn.Sequential(
        Downsample(in_channels=64, out_channels=256, kernel_size=(1, 5), stride=(1, 2)),
        Downsample(in_channels=128, out_channels=512*2, kernel_size=(1, 5), stride=(1, 2))
      )
      self.residual_blocks = torch.nn.Sequential(
        *[ResidualBlock(in_channels1=512, out_channels1=1024,
                        in_channels2=512, out_channels2=512,
                        kernel_size=(1, 3), stride=(1, 1)) for _ in range(6)]
      )
      self.upsample_twice = torch.nn.Sequential(
         Upsample(in_channels=512, out_channels=1024, kernel_size=(1, 5), stride=(1, 1)),
         Upsample(in_channels=128, out_channels=512, kernel_size=(1, 5), stride=(1, 1)),
      )
      self.conv2 = torch.nn.Conv2d(in_channels=64, out_channels=24, kernel_size=(1, 15), stride=(1, 1))

    def forward(self, x):
      x = self.conv1(x)
      x = self.glu(x)
      x = self.downsample_twice(x)
      x = self.residual_blocks(x)
      x = self.upsample_twice(x)
      x = self.conv2(x)
      return x

class Discriminator(torch.nn.Module):
   
  def __init__(self):
    super(Discriminator, self).__init__()
    self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=128, kernel_size=(3, 3), stride=(1, 2), padding=(1,1))
    self.glu = torch.nn.GLU(dim=1)
    
    self.d1 = Downsample(in_channels=64, out_channels=256, kernel_size=(3, 3), stride=(2, 2), padding=(1,1))
    self.d2 = Downsample(in_channels=128, out_channels=512, kernel_size=(3, 3), stride=(2, 2), padding=(1,1))
    self.d3 = Downsample(in_channels=256, out_channels=1024, kernel_size=(6, 3), stride=(1, 2), padding=(0,0))
    
    self.fc = torch.nn.Linear(in_features=3584, out_features=1)
    self.sigmoid = torch.nn.Sigmoid()     

  def forward(self, x):
    x = self.conv1(x)
    x = self.glu(x)
    x = self.d1(x)
    x = self.d2(x)
    x = self.d3(x)
    x = x.flatten(start_dim=1)
    x = self.fc(x)
    x = self.sigmoid(x)
    return x

In [152]:
def test_residual_block():
  residual = ResidualBlock(in_channels1=1024, out_channels1=1024, 
              in_channels2=512, out_channels2=1024,
              kernel_size=(1, 3), stride=(1, 1))

  residual.forward(torch.randn(1, 1024, 1, 1024)).size()
test_residual_block()

def test_downsample_block():
  downsample = Downsample(in_channels=24, out_channels=256, kernel_size=(1, 5), stride=(1, 2))
  print(downsample.forward(torch.randn(1, 24, 1, 1024)).size())
test_downsample_block()

def test_upsample_block():
  upsample1 = Upsample(in_channels=512, out_channels=1024, kernel_size=(1, 5), stride=(1, 1))
  upsample2 = Upsample(in_channels=128, out_channels=512, kernel_size=(1, 5), stride=(1, 1))
  x = torch.randn(1, 512, 1, 1024)
  x = upsample1.forward(x)
  print(x.size())
  x = upsample2.forward(x)
  print(x.size())
test_upsample_block()

def test_generator():
  generator = Generator()
  print(generator.forward(torch.randn(1, 24, 1, 1024)).size())
test_generator()

def test_discriminator():
  discriminator = Discriminator()
  print(discriminator.forward(torch.randn(1, 1, 24, 128)).size())
test_discriminator()

torch.Size([1, 128, 1, 510])
torch.Size([1, 128, 2, 2040])
torch.Size([1, 64, 4, 4072])
torch.Size([1, 24, 4, 462])
torch.Size([1, 1])


In [158]:
class CycleGAN(torch.nn.Module):
      
      def __init__(self):
        super(CycleGAN, self).__init__()
        self.Gx_y = Generator()
        self.Gy_x = Generator()
        self.Dy = Discriminator()
        self.Dx = Discriminator()

In [165]:
def test_cyclegan():
  cycleGAN = CycleGAN()

  # Forward-inverse mapping
  x = torch.randn(1, 24, 1, 1024)
  y_hat = cycleGAN.Gx_y.forward(x)
  x_hat = cycleGAN.Gy_x.forward(y_hat)
  print(x_hat.size())

  cycleGAN.Dy.forward(y_hat).size()

  # Inverse-forward mapping
  y = torch.randn(1, 24, 1, 1024)
  x_hat = cycleGAN.Gy_x.forward(y)
  y_hat = cycleGAN.Gx_y.forward(x_hat)
  print(y_hat.size())

  # cycleGAN.Dx.forward(x_hat).size()

test_cyclegan()

torch.Size([1, 24, 16, 182])


RuntimeError: Given groups=1, weight of size [128, 1, 3, 3], expected input[1, 24, 4, 462] to have 1 channels, but got 24 channels instead