In [None]:
import os
import torch
import numpy as np
import torchvision as tv
from torch.utils.data import DataLoader,random_split,Dataset,TensorDataset
import torch.nn as nn
from torchvision import transforms,datasets
import torch.optim as optim
from torchmetrics import ConfusionMatrix
import matplotlib.pyplot as plt
import seaborn as sb
from torchmetrics.classification import MulticlassAccuracy
from torchvision.transforms import Resize,ToTensor,Compose
import librosa, IPython
import librosa.display as lplt
import random
from tqdm import tqdm
from torchaudio.transforms import Resample
import json
import math
from sklearn.metrics import confusion_matrix
import soundfile as sf
import shutil



## Audio Generation using GAN

In [None]:
data_path = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original'
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

# define dataset split (fractions of the full dataset)
training_size = 0.7
validation_size = 0.2
testing_size = 0.1
batch_size = 64

# define epochs
epochs = 50

# define seed for reproducibility; seed every RNG the notebook imports
seed=7
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs end-to-end on a machine without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"

# probe one reference clip to learn the sample rate and the clip length
audio_example = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/country/country.00000.wav'
audio_data, sr = librosa.load(audio_example)
sample_num = len(audio_data)

# clip duration in seconds = number of samples / samples per second
aud_length = sample_num / sr
print(f"Sample rate: {sr}")
print(f"Audio length: {aud_length}")
print(f"Sequence length: {sample_num}")

# MFCCs of the reference clip, 13 coefficients per frame
mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)


In [None]:

# Collect the path of every audio file together with its genre.
# The genre is the name of the folder that directly contains the file, so it
# is taken from `root` instead of a fixed position in the split path (the
# position depends on how deep `data_path` sits in the filesystem).
corrupt_file_name = 'jazz.00054.wav'  # known-corrupt GTZAN clip

music_array = []
genres = []
for root, dirs, files in os.walk(data_path):
    for name in files:
        # skip corrupt file for processing (matched by name so the check
        # works regardless of where the dataset is mounted)
        if name == corrupt_file_name:
            continue
        music_array.append(os.path.join(root, name))
        genres.append(os.path.basename(root))
            
        

### Feature Extraction

In [None]:
# define file path for the extracted feature dictionary (JSON written by feature_extraction)
fe_file="/kaggle/working/fe.json"
# number of samples in one clip (duration * sample rate); feature_extraction
# reads this module-level value to size its segments
samples = aud_length*sr

# function for feature extraction
def feature_extraction(data_path, fe_file, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract per-segment MFCCs for every clip under ``data_path`` and save them as JSON.

    Each immediate sub-folder visited by ``os.walk`` is treated as one genre.
    Every clip is cut into ``num_segments`` equal pieces, MFCCs are computed
    for each piece, and pieces with the expected number of frames are kept.

    Relies on two module-level values: ``samples`` (samples per clip) and
    ``sr`` (sample rate used when loading).

    Args:
        data_path: root folder containing one sub-folder per genre.
        fe_file: path of the JSON file to write.
        num_mfcc: number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: hop size passed to librosa.
        num_segments: number of equal segments each clip is split into.

    Writes a JSON object with keys ``"mapping"`` (genre names, index = label),
    ``"labels"`` (one integer per kept segment) and ``"mfcc"`` (one
    frames-by-coefficients nested list per kept segment). Returns ``None``.
    """
    # dictionary to store mfccs, target labels, and corresponding mappings
    audio_info = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    # known-corrupt GTZAN clip; matched by name so it is skipped wherever the
    # dataset (or a copy of it) lives
    corrupt_file_name = 'jazz.00054.wav'

    samples_per_seg = int(samples / num_segments)
    mfcc_vectors_per_seg = math.ceil(samples_per_seg / hop_length)

    # loop through all genre sub-folders
    for dirpath, dirnames, filenames in os.walk(data_path):

        # sort in place so the traversal order (and therefore the label
        # numbering) is the same on every run and every filesystem
        dirnames.sort()

        # the root folder itself holds no clips of a genre; compare by value,
        # not identity, since two equal path strings need not be the same object
        if dirpath == data_path:
            continue

        # register the genre; its label is its position in the mapping list
        genre_label = os.path.basename(dirpath)
        audio_info["mapping"].append(genre_label)
        label = len(audio_info["mapping"]) - 1

        # process all audio files in the genre sub-folder
        for f in sorted(filenames):

            # skip corrupt file jazz.00054
            if f == corrupt_file_name:
                continue

            # load audio file at the notebook-wide sample rate
            file_path = os.path.join(dirpath, f)
            signal, sample_rate = librosa.load(file_path, sr=sr)

            # process each equal-length segment of the clip
            for s in range(num_segments):

                # find start and finish sample of current audio segment
                start = samples_per_seg * s
                finish = start + samples_per_seg

                # extract mfccs and transpose to (frames, coefficients)
                mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
                mfcc = mfcc.T

                # keep only segments with the expected number of frames so
                # every stored sample has the same shape
                if len(mfcc) == mfcc_vectors_per_seg:
                    audio_info["mfcc"].append(mfcc.tolist())
                    audio_info["labels"].append(label)

    # save extracted mfccs to the given json file
    with open(fe_file, "w") as fp:
        json.dump(audio_info, fp, indent=4)
        
# run the extraction over the original dataset, splitting every clip into
# 5 segments; the result is written to the JSON file at fe_file
feature_extraction(data_path, fe_file, num_segments=5)

### Data Loading

In [None]:
# json_path = "/kaggle/working/fe.json"


def load_fe_data(json_path):
    """Read the extracted-feature JSON and return its three fields as arrays.

    Args:
        json_path: path to a JSON file with "mfcc", "labels" and "mapping" keys.

    Returns:
        Tuple ``(X, y, z)`` of NumPy arrays built from the "mfcc", "labels"
        and "mapping" entries, in that order.
    """
    with open(json_path, "r") as handle:
        payload = json.load(handle)

    # convert each stored list to an array, keeping the mfcc / labels / mapping order
    features, targets, names = (
        np.array(payload[field]) for field in ("mfcc", "labels", "mapping")
    )
    return features, targets, names


In [None]:
X,y,z = load_fe_data('/kaggle/working/fe.json')

# wrap the MFCC features and their labels in a dataset so there is something
# to split; features become float32 tensors, labels int64 tensors
ds = TensorDataset(torch.from_numpy(X).float(), torch.from_numpy(y).long())

sample_num = len(ds)

# the test split takes whatever is left so the three sizes always sum to sample_num
train_samples = int(training_size * sample_num)
val_samples = int(validation_size * sample_num)
test_samples = sample_num - train_samples - val_samples

# split data as defined
train_data, val_data, test_data = random_split(ds, [train_samples, val_samples, test_samples])

# load data using DataLoader with the configured batch size
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# shape of the reference clip's mfccs, and of the full extracted feature array
print("MFCCs shape:", mfccs.shape)
print("Dataset features shape:", X.shape)

### GAN

In [None]:
# define generator
class Generator(nn.Module):
    """Transposed-convolution generator that upsamples a latent vector.

    Args:
        input_dim: size of the latent vector; used as the channel count of
            the first ConvTranspose1d layer.
        output_dim: accepted for interface compatibility; not used by the
            network (the last layer always emits 216 channels).

    ``forward`` takes a ``(batch, input_dim)`` tensor and returns a
    ``(batch, 216, 41779)`` tensor.
    """

    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()

        # nn.ReLU's only positional argument is the `inplace` flag, so a
        # slope value has to go through LeakyReLU to have any effect
        self.model = nn.Sequential(
                # the latent vector enters as input_dim channels of length 1
                nn.ConvTranspose1d(input_dim, 1024, 3, 15, 0, bias=False),
                nn.LeakyReLU(0.3),
                nn.ConvTranspose1d(1024, 512, 3, 15, 2, bias=False),
                nn.LeakyReLU(0.3),
                nn.ConvTranspose1d(512, 256, 3, 15, 2, bias=False),
                nn.LeakyReLU(0.3),
                nn.ConvTranspose1d(256, 216, 3, 10, 2, bias=False),
                nn.LeakyReLU(0.3),
                nn.ConvTranspose1d(216, 216, 3, 10, 2, bias=False),
                nn.LeakyReLU(0.3),
        )

    def forward(self, x):
        # add a trailing length-1 axis: (batch, input_dim) -> (batch, input_dim, 1),
        # the (batch, channels, length) layout ConvTranspose1d expects
        x = x.unsqueeze(-1)

        return self.model(x)
    
# define discriminator
class Discriminator(nn.Module):
    """Strided-convolution discriminator that returns raw logits.

    Args:
        input_dim: number of input channels; must equal the channel count of
            the tensors it is fed (the generator emits 216).

    ``forward`` takes a ``(batch, input_dim, length)`` tensor and returns a
    tensor with 3998 channels of unbounded logits, suitable for
    ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self, input_dim):
        super(Discriminator, self).__init__()

        # input shape of discriminator will be output shape of generator.
        # nn.ReLU's only positional argument is the `inplace` flag, so the
        # slope has to go through LeakyReLU. No activation follows the last
        # convolution: BCEWithLogitsLoss applies the sigmoid itself, and a
        # ReLU there would force every logit to be >= 0 (probability >= 0.5).
        self.model = nn.Sequential(
            nn.Conv1d(input_dim, 128, 3, 4, 2, bias=False),
            nn.LeakyReLU(0.2),
            nn.Conv1d(128, 256, 3, 4, 2, bias=False),
            nn.LeakyReLU(0.2),
            nn.Conv1d(256, 512, 3, 4, 2, bias=False),
            nn.LeakyReLU(0.2),
            nn.Conv1d(512, 3998, 3, 4, 2, bias=False))

    def forward(self, x):
        return self.model(x)

# define latent dimension (length of the noise vector fed to the generator;
# it is also the channel count of the generator's first layer)
input_dim= 1000

# instantiate generator and discriminator on the selected device;
# 13 is passed as the generator's output_dim, 216 as the discriminator's
# input channel count
generator = Generator(input_dim,13).to(device)
discriminator = Discriminator(216).to(device)


# define loss function (binary cross-entropy computed on raw logits)
criterion = nn.BCEWithLogitsLoss()

# define optimisers: one Adam optimiser per network, same learning rate
generator_optimizer = optim.Adam(generator.parameters(), lr=0.0001)
discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)


# training loop: alternate one discriminator update and one generator update per batch
num_epochs = epochs
for epoch in range(num_epochs):
    for (data, targets) in train_loader:

        data = data.to(device=device).squeeze(1)
        data = data.float()

        # generate noise to feed to generator (created directly on the device)
        noise = torch.randn(batch_size, input_dim, device=device)

        # generate fake samples from generator
        fake_music = generator(noise)

        # ---- train the discriminator network ----
        discriminator_optimizer.zero_grad()

        # get output from discriminator fed real music
        real_output = discriminator(data)

        # get output from discriminator fed fake music; detached so this step
        # does not back-propagate into the generator
        fake_output = discriminator(fake_music.detach())

        # get discriminator loss: real samples should score 1, fakes 0
        d_loss_real = criterion(real_output, torch.ones_like(real_output))
        d_loss_fake = criterion(fake_output, torch.zeros_like(fake_output))
        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        discriminator_optimizer.step()

        # ---- train generator ----
        generator_optimizer.zero_grad()

        # score the fakes again WITHOUT detaching: the gradient has to flow
        # through the discriminator back into the generator's weights,
        # otherwise the generator optimiser has nothing to apply
        fake_output = discriminator(fake_music)

        # get generator loss: the generator wants its fakes to be scored as real
        g_loss = criterion(fake_output, torch.ones_like(fake_output))
        g_loss.backward()
        generator_optimizer.step()

    # report the losses of the last batch of the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Disc_loss: {d_loss.item():.4f}, Gen_loss: {g_loss.item():.4f}')


### Save generated audio

In [None]:

# list to store audio samples (one mono NumPy array per generated clip)
generated_audio_samples = []

# generate one batch (batch_size clips) without tracking gradients
with torch.no_grad():

    # generate noise to feed generator; the latent size must match the one
    # the generator was built with, so reuse input_dim
    s_noise = torch.randn(batch_size, input_dim, device=device)
    gen_audio = generator(s_noise).detach().cpu()

# processing samples to save in right format
for i in range(gen_audio.shape[0]):

    # average channels to get a single mono track
    mono_fin_audio = gen_audio[i].mean(0)

    # limit samples to the [-1, 1] range and convert to numpy array
    mono_fin_audio = torch.clamp(mono_fin_audio, -1, 1).numpy()
    generated_audio_samples.append(mono_fin_audio)

    # save audio file at the same sample rate the dataset was loaded with
    filename = f'/kaggle/working/generated_audio_{i}.wav'
    sf.write(filename, mono_fin_audio, sr)
  

### Re-process generated audio

In [None]:

# define source and destination folders

source = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original"
des = "/kaggle/working/final"

# duplicate original genre folders; copytree refuses to write into an
# existing folder, so skip the copy when a previous run already made it
if not os.path.isdir(des):
    shutil.copytree(source, des)


In [None]:

generated_ds_path = '/kaggle/working/'

ds_path = '/kaggle/working/final'

# every WAV file sitting directly in the working folder
to_move = list(filter(lambda entry: entry.endswith(".wav"), os.listdir(generated_ds_path)))

# relocate each of those files into the top level of the copied dataset folder
for file in to_move:
    source_file, destination_file = (
        os.path.join(folder, file) for folder in (generated_ds_path, ds_path)
    )
    shutil.move(source_file, destination_file)