In [0]:
# Notebook setup: (re)load the autoreload extension so edited modules are picked
# up live (mode 2), and render matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Environment setup: pin torch to 1.5.0, run the fast.ai Colab setup script,
# then clone fastai_audio and run its installer.
# NOTE: `%cd fastai_audio` changes the kernel's working directory (to
# /content/fastai_audio per the output below), so later relative paths and the
# unzip step resolve from there.
# NOTE(review): `curl ... | bash` executes a remote script without inspection -
# confirm the source is trusted before re-running.
! pip install torch==1.5.0 
!curl -s https://course.fast.ai/setup/colab | bash
!git clone https://github.com/mogwai/fastai_audio.git
%cd fastai_audio
!bash install.sh

Updating fastai...
Done.
Cloning into 'fastai_audio'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 1437 (delta 14), reused 35 (delta 14), pack-reused 1397[K
Receiving objects: 100% (1437/1437), 161.40 MiB | 31.04 MiB/s, done.
Resolving deltas: 100% (878/878), done.
/content/fastai_audio
Get:1 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease [21.3 kB]
Get:2 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease [15.4 kB]
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:6 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
Ign:8 https://developer.download.nvidia.com/compute/cuda/rep

In [0]:
from audio import *
from fastai.basics import *
from fastai.vision import *
from fastai.vision.gan import *

In [0]:
# Extract the sound archive into the current working directory
# (-q: quiet, -n: never overwrite files that already exist).
# NOTE(review): the archive is read from /content/drive, but no visible cell
# mounts Google Drive - presumably it was mounted manually; confirm, otherwise
# this cell fails on a fresh runtime.
! unzip -q -n /content/drive/My\ Drive/sounds.zip #unzip sound files from the drive, but you can load any sound files

In [0]:
# Audio preprocessing config: cut every clip into fixed-size segments to ease
# processing. segment_size=100 produces the 0.1-second items shown in the
# databunch summary (presumably the unit is milliseconds - confirm in fastai_audio).
config_segment = AudioConfig(segment_size = 100)

In [6]:
# Folder holding the unzipped sound files, and the transform set (none needed).
data_folder = '/content/fastai_audio/sounds'
tfms = None

# Data-block pipeline, one stage per line.
db_audio = (
    AudioList.from_folder(data_folder, config=config_segment)  # collect the files; config_segment drives the segmentation
    .split_none()                                              # no train/validation split
    .label_from_folder()                                       # single folder, so every item gets the same label
    .transform(tfms=tfms)                                      # no transforms applied
    .databunch(bs=128)                                         # batches of 128 items, wrapped in a databunch
)

Preprocessing: Segmenting Items


In [8]:
# Display the databunch summary: number of items per split, and for each item
# its duration, channel count, sample count / sample rate, plus the labels.
db_audio

AudioDataBunch;

Train: AudioLabelList (6054 items)
x: AudioList
AudioItem 0.1 seconds (2 channels, 4410 samples @ 44100hz),AudioItem 0.1 seconds (2 channels, 4410 samples @ 44100hz),AudioItem 0.1 seconds (2 channels, 4410 samples @ 44100hz),AudioItem 0.1 seconds (2 channels, 4410 samples @ 44100hz),AudioItem 0.1 seconds (2 channels, 4410 samples @ 44100hz)
y: CategoryList
sounds,sounds,sounds,sounds,sounds
Path: /content/fastai_audio/sounds;

Valid: AudioLabelList (0 items)
x: AudioList

y: CategoryList

Path: /content/fastai_audio/sounds;

Test: None

In [0]:
# Pull a single batch from the databunch: x holds the inputs, y the labels.
x, y=db_audio.one_batch()

In [9]:
# Inspect the batch shapes. Per the output below, x is
# (128 items, 2 channels, 128 rows, 18 columns) and y is (128 labels,).
x.shape, y.shape

(torch.Size([128, 2, 128, 18]), torch.Size([128]))

In [0]:
# Number of GPUs to use; 0 forces the CPU even when CUDA is available.
ngpu = 1

# Use the first CUDA device only when one is available and GPU use is requested.
if torch.cuda.is_available() and ngpu > 0:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:
class Generator(nn.Module):
    """Generative model: a stack of 1-D transposed convolutions.

    Four ConvTranspose1d -> BatchNorm1d -> ReLU stages widen the channel
    dimension 2 -> 128 -> 128 -> 256 -> 512; a final ConvTranspose1d maps to
    1024 channels and is followed by a tanh.

    Every layer uses kernel_size=2 with the default stride of 1, so each one
    lengthens the last axis by one step: an input of shape
    (batch, 2, length) produces an output of shape (batch, 1024, length + 5).

    Returns:
        Tensor of generated values bounded to [-1, 1] by the final tanh.

    NOTE(review): batches drawn from db_audio are 4-D ([128, 2, 128, 18]),
    whereas these 1-D layers take batched 3-D input - confirm what this model
    is meant to be fed.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.ConvTranspose1d(2, 128, 2)  # 2 input channels -> 128, kernel_size 2
        self.bn1 = nn.BatchNorm1d(128)
        self.conv2 = nn.ConvTranspose1d(128, 128, 2)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv3 = nn.ConvTranspose1d(128, 256, 2)
        self.bn3 = nn.BatchNorm1d(256)
        self.conv4 = nn.ConvTranspose1d(256, 512, 2)
        self.bn4 = nn.BatchNorm1d(512)
        self.fc1 = nn.ConvTranspose1d(512, 1024, 2)  # final projection to 1024 channels
        self.th = nn.Tanh()

    def forward(self, x):
        """Map `x` of shape (batch, 2, length) to samples in [-1, 1]."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.fc1(x)
        # Return the tanh output directly. The previous F.log_softmax(x, dim=2)
        # turned the bounded samples into log-probabilities over the length
        # axis (all values <= 0), which is a classifier head, not generator
        # output.
        return self.th(x)

# Build the generator and move it to the selected device in one step
# (Module.to returns the module itself), then show its layer summary.
netG = Generator().to(device)
print(netG)

Generator(
  (conv1): ConvTranspose1d(2, 128, kernel_size=(2,), stride=(1,))
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): ConvTranspose1d(128, 128, kernel_size=(2,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): ConvTranspose1d(128, 256, kernel_size=(2,), stride=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): ConvTranspose1d(256, 512, kernel_size=(2,), stride=(1,))
  (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): ConvTranspose1d(512, 1024, kernel_size=(2,), stride=(1,))
  (th): Tanh()
)


In [14]:
class Discriminator(nn.Module):
    """Critic model: mirrors the generator's channel progression in reverse.

    Four ConvTranspose1d -> BatchNorm1d -> ReLU stages narrow the channels
    1024 -> 512 -> 256 -> 128 -> 128; a final ConvTranspose1d maps to 2
    channels, followed by tanh and a log-softmax over dim 2.

    Every layer uses kernel_size=2 with the default stride of 1, so an input
    of shape (batch, 1024, length) gives an output of shape
    (batch, 2, length + 5).

    NOTE(review): the head is a 2-channel log-softmax taken over the length
    axis (dim=2) of a tanh-bounded tensor, not one score per sample - confirm
    this is the intended critic output.
    """

    def __init__(self):
        super().__init__()
        # Input channel count equals the generator's output channels: 1024.
        self.conv1 = nn.ConvTranspose1d(1024, 512, 2)
        self.bn1 = nn.BatchNorm1d(512)
        self.conv2 = nn.ConvTranspose1d(512, 256, 2)
        self.bn2 = nn.BatchNorm1d(256)
        self.conv3 = nn.ConvTranspose1d(256, 128, 2)
        self.bn3 = nn.BatchNorm1d(128)
        self.conv4 = nn.ConvTranspose1d(128, 128, 2)
        self.bn4 = nn.BatchNorm1d(128)
        # Output channel count equals the original data's channel count: 2.
        self.fc1 = nn.ConvTranspose1d(128, 2, 2)
        self.th = nn.Tanh()

    def forward(self, x):
        """Run `x` through the four hidden stages and the tanh/log-softmax head."""
        hidden_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for deconv, norm in hidden_stages:
            x = F.relu(norm(deconv(x)))
        squashed = self.th(self.fc1(x))
        return F.log_softmax(squashed, dim=2)

# Build the critic and move it to the selected device in one step
# (Module.to returns the module itself), then show its layer summary.
netD = Discriminator().to(device)
print(netD)

Discriminator(
  (conv1): ConvTranspose1d(1024, 512, kernel_size=(2,), stride=(1,))
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): ConvTranspose1d(512, 256, kernel_size=(2,), stride=(1,))
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): ConvTranspose1d(256, 128, kernel_size=(2,), stride=(1,))
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): ConvTranspose1d(128, 128, kernel_size=(2,), stride=(1,))
  (bn4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): ConvTranspose1d(128, 2, kernel_size=(2,), stride=(1,))
  (th): Tanh()
)


In [0]:
# Loss functions handed to GANLearner: cross-entropy for both generator and critic.
# NOTE(review): cross-entropy scores predictions against class targets, which is
# not the usual shape of a GAN objective - confirm these match the call
# signatures GANLearner expects for gen_loss_func / crit_loss_func.
G_loss = CrossEntropyFlat()
D_loss = CrossEntropyFlat()

In [0]:
# fastai's GAN learner wrapper: bundles the data, the generator (netG) and
# critic (netD) models, and their loss functions.
# gen_first=True presumably starts training with the generator, and
# switch_eval=False presumably leaves the idle model in train mode - confirm
# against the fastai GANLearner docs.
learn = GANLearner(db_audio, netG, netD, gen_loss_func=G_loss, crit_loss_func=D_loss, switch_eval=False, gen_first=True)

In [17]:
# Train for 5 epochs. Currently fails with:
#   RuntimeError: Expected 3-dimensional input for 3-dimensional weight [1024, 512, 2],
#   but got 1-dimensional input of size [128] instead
# The weight shape [1024, 512, 2] is that of Discriminator.conv1
# (ConvTranspose1d(1024, 512, 2)), so the failure is on the critic's first layer.
# The offending input of size [128] has the same shape as y, the label batch
# returned by db_audio.one_batch() (x is [128, 2, 128, 18], y is [128]).
# NOTE(review): the critic therefore appears to be receiving the labels rather
# than audio - presumably GANLearner feeds the batch *target* to the critic as
# the "real" sample, so the databunch would need audio as its target; confirm
# against the fastai GAN docs.
learn.fit(5)

epoch,train_loss,valid_loss,gen_loss,disc_loss,time


RuntimeError: ignored