In [1]:
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [2]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/Drive/MyDrive/kaggle'

Download the Dataset

In [3]:
!kaggle datasets download -d a13x10/basic-arabic-vocal-emotions-dataset

Downloading basic-arabic-vocal-emotions-dataset.zip to /content
 87% 121M/138M [00:01<00:00, 102MB/s]
100% 138M/138M [00:01<00:00, 109MB/s]


In [4]:
# extract the dataset from the zipfile
import zipfile

file_path = '/content/basic-arabic-vocal-emotions-dataset.zip'

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/basic-arabic-vocal-emotions-dataset/')

# ***Import the necessary Libraries***

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import librosa
import random
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [6]:
import soundfile as sf

# ***Generator Class***

In [7]:
class Generator(nn.Module):
    def __init__(self, channels_noise, channels_img, features_g):
        super(Generator, self).__init__()

        # Sequential model for the generator
        self.gen = nn.Sequential(
            # First convolutional block
            self._block(channels_noise, features_g * 16, 4, 1, 0),

            # Second convolutional block
            self._block(features_g * 16, features_g * 8, 4, 2, 1),

            # Third convolutional block
            self._block(features_g * 8, features_g * 4, 4, 2, 1),

            # Fourth convolutional block
            self._block(features_g * 4, features_g * 2, 4, 2, 1),

            # Final convolutional layer with transpose operation
            nn.ConvTranspose2d(
                features_g * 2, channels_img, kernel_size=2, stride=2, padding=1
            ),

            # Tanh activation function to output audio in range [-1, 1]
            nn.Tanh(),
        )

    def _block(self, in_channels, out_channels, kernel_size, stride, padding):
        """Helper function to define a convolutional block with transpose convolution, batch normalization, and ReLU activation."""
        return nn.Sequential(
            # Transpose convolutional layer
            nn.ConvTranspose2d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                padding,
                bias=False,
            ),
            # Batch normalization to stabilize training
            nn.BatchNorm2d(out_channels),
            # ReLU activation function to introduce non-linearity
            nn.ReLU(),
        )

    def forward(self, x):
        """
        Forward pass of the generator.

        """
        return self.gen(x)


# ***Discriminator Class***

In [8]:
class Discriminator(nn.Module):
    def __init__(self, channels_img, features_d):
        super(Discriminator, self).__init__()

        # Sequential model for the discriminator
        self.disc = nn.Sequential(
            # First convolutional layer
            nn.Conv2d(channels_img, features_d, kernel_size=2, stride=2, padding=1),
            # LeakyReLU activation function to introduce non-linearity
            nn.LeakyReLU(0.2),
            # Second convolutional block
            self._block(features_d, features_d * 2, 4, 2, 1),
            # Third convolutional block
            self._block(features_d * 2, features_d * 4, 4, 2, 1),
            # Fourth convolutional block
            self._block(features_d * 4, features_d * 8, 4, 2, 1),
            # Final convolutional layer
            nn.Conv2d(features_d * 8, 1, kernel_size=2, stride=2, padding=0),
            # Sigmoid activation function to output probability
            nn.Sigmoid(),
        )

    def _block(self, in_channels, out_channels, kernel_size, stride, padding):
        """
        Helper function to define a convolutional block with convolution, batch normalization, and LeakyReLU activation.

        """
        return nn.Sequential(
            # Convolutional layer
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                padding,
                bias=False,
            ),
            # Batch normalization to stabilize training
            nn.BatchNorm2d(out_channels),
            # LeakyReLU activation function to introduce non-linearity
            nn.LeakyReLU(0.2),
        )

    def forward(self, x):
        """
        Forward pass of the discriminator.

        """
        return self.disc(x)


In [9]:
def initialize_weights(model):
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
            nn.init.normal_(m.weight.data, 0.0, 0.02)

# ***Initialise Hyperparameters***

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 2e-4
BATCH_SIZE = 64
channels_noise = 64
channels_img=  1
features_g = 64
features_d = 64

In [11]:
# Instantiate the generator and move it to the device (GPU or CPU)
gen = Generator(channels_noise, channels_img, features_g).to(device)

# Instantiate the discriminator and move it to the device (GPU or CPU)
disc = Discriminator(channels_img, features_d).to(device)

# Initialize the weights of the generator and discriminator
initialize_weights(gen)
initialize_weights(disc)


In [12]:
# Define the optimizer for the generator, specifying its parameters, learning rate, and beta values
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))

# Define the optimizer for the discriminator, specifying its parameters, learning rate, and beta values
opt_disc = optim.Adam(disc.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))

# Define the binary cross-entropy loss function
criterion = nn.BCELoss()


In [13]:
# Function to extract MFCC features from the audio files
def preprocess_wav(file_path):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    return mfccs


In [14]:
# Define the directory containing the dataset
filename = '/content/basic-arabic-vocal-emotions-dataset/remake'

# Initialize an empty list to store the extracted features
feats = []

# Iterate over the directories in the dataset directory
for i in os.listdir(filename):
    # Check if the directory name has a length of 1
    if len(i) == 1:
        # Construct the full path of the subdirectory
        filepath = os.path.join(filename, i)
        # Iterate over the files in the subdirectory
        for j in os.listdir(filepath):
            # Construct the full path of the audio file
            aud_path = os.path.join(filepath, j)
            # Extract MFCC features from the audio file
            mfcc = preprocess_wav(aud_path)
            # Append a tuple containing the filename and its corresponding MFCC features to the feats list
            feats.append((j, mfcc))

# Shuffle the list of tuples to randomize the order of the data
random.shuffle(feats)


In [15]:
len(feats)

1935

In [16]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Define a function to pad or truncate MFCC features
def pad_truncate(features, max_len):
    if len(features[0]) < max_len:
        padded = [torch.cat([torch.tensor(feature), torch.zeros(max_len - len(feature))], dim=0) for feature in features]
    else:
        padded = [torch.tensor(feature)[:max_len] for feature in features]
    return torch.stack(padded)

# Assuming feats is a list of tuples (filename, mfcc)
max_len = max(len(mfcc) for _, mfcc in feats)  # Find the maximum length of MFCC features

# Pad or truncate MFCC features
padded_feats = [(filename, pad_truncate(mfcc, max_len)) for filename, mfcc in feats]


In [17]:
train_loader = DataLoader(padded_feats, batch_size=BATCH_SIZE, shuffle=True)

In [18]:
# Define a function to save generated audio
def save_generated_audio(generated_audio, epoch, batch_idx):
    for i, audio in enumerate(generated_audio):
        output_file_path = f"generated_audio_epoch{epoch}_batch{batch_idx}_example{i}.wav"
        sf.write(output_file_path, audio.squeeze().detach().cpu().numpy(), sr)

In [19]:
num_epochs = 5
sr = 16000
for epoch in range(num_epochs):
    for batch_idx, (filename, audio_features) in enumerate(train_loader):
        # Move audio features to device
        audio_features = audio_features.unsqueeze(1).to(device)

        ### Train Discriminator ###
        opt_disc.zero_grad()

        # Generate fake audio
        noise = torch.randn(len(audio_features), channels_noise, 1, 1).to(device)
        fake_audio = gen(noise)

        # Train discriminator on real audio
        disc_real_output = disc(audio_features).reshape(-1)
        disc_real_loss = criterion(disc_real_output, torch.ones_like(disc_real_output))

        # Train discriminator on fake audio
        disc_fake_output = disc(fake_audio.detach()).reshape(-1)
        disc_fake_loss = criterion(disc_fake_output, torch.zeros_like(disc_fake_output))

        # Total discriminator loss
        disc_loss = (disc_real_loss + disc_fake_loss) / 2

        # Backpropagation
        disc_loss.backward()
        opt_disc.step()

        ### Train Generator ###
        opt_gen.zero_grad()

        # Generate fake audio
        fake_audio = gen(noise)

        # Get discriminator's prediction on fake audio
        disc_fake_output = disc(fake_audio).reshape(-1)

        # Generator loss
        gen_loss = criterion(disc_fake_output, torch.ones_like(disc_fake_output))

        # Backpropagation
        gen_loss.backward()
        opt_gen.step()

        # Print losses
        if batch_idx % 100 == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(train_loader)} \
                  Loss D: {disc_loss:.4f}, loss G: {gen_loss:.4f}"
            )

        # Save generated audio every few batches
        if batch_idx % 500 == 0:
            with torch.no_grad():
                gen.eval()
                generated_audio = gen(torch.randn(5, channels_noise, 1, 1).to(device))
                save_generated_audio(generated_audio, epoch, batch_idx)
                gen.train()

Epoch [0/5] Batch 0/31                   Loss D: 0.6923, loss G: 0.7103
Epoch [1/5] Batch 0/31                   Loss D: 0.4358, loss G: 0.9853
Epoch [2/5] Batch 0/31                   Loss D: 0.2806, loss G: 1.3051
Epoch [3/5] Batch 0/31                   Loss D: 0.1699, loss G: 1.7126
Epoch [4/5] Batch 0/31                   Loss D: 0.1039, loss G: 2.1440
