# **Data augmentation research** #

In [5]:
import librosa
import soundfile as sf
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
import shutil
import cv2
import matplotlib.pyplot as plt
import multiprocessing
import concurrent.futures
from time import perf_counter

**The following cell is for clearing the output directory from kaggle; run it only if needed**

In [None]:
# Specify the path to the output directory
output_directory = "/kaggle/working/"

# Iterate through the files and subdirectories in the output directory
for item in os.listdir(output_directory):
    item_path = os.path.join(output_directory, item)
    if os.path.isfile(item_path):
        os.remove(item_path)
    elif os.path.isdir(item_path):
        shutil.rmtree(item_path)

print("Output directory cleared.")

In [None]:
os.makedirs("Spectrograms_computed")
filenames = ['blues', 'classical', 'country', 'disco', 'hiphop','jazz', 'metal', 'pop', 'reggae', 'rock']

for filename in filenames:
    os.makedirs("Spectrograms_computed/"+filename)

In [7]:
'Define the function to compute and save the spectograms'

def spectrogram_creation(audio_path, song_name, spectrogram_save_path, segment_duration=30, num_segments=1, augmentation_percentage=-1):
    # Load audio file
    y, sr = librosa.load(audio_path)

    # Create Figure and Axes objects
    fig, ax = plt.subplots(figsize=(10, 4))

    for segment_index in range(num_segments):
        start_sample = int(segment_index * segment_duration * sr)
        end_sample = int((segment_index + 1) * segment_duration * sr)

        # Extract the segment from the loaded audio
        y_segment = y[start_sample:end_sample]
        
        # Apply augmentation with a certain probability
        if np.random.rand() < augmentation_percentage:
            spectrogram_save_path = spectrogram_save_path + "Aug"
            y_segment = apply_random_augmentation(y_segment, sr)

        # Compute the spectrogram for the segment and convert to dB
        spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
        spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

        # Plot the spectrogram on the existing Axes
        librosa.display.specshow(spectrogram_db)

        # Save the figure to the file
        fig.savefig(f'{spectrogram_save_path}_segment_{segment_index + 1}-{song_name}.png')

        # Clear the existing plot for the next iteration
        ax.cla()

    # Close the Figure to release resources
    plt.close(fig)

# Choose a random data augmentation method to apply on the signal
def apply_random_augmentation(signal, sr):
    # Randomly choose an augmentation function or return the original signal
    augmentation_functions = [add_white_noise, pitch_scale, random_gain, nothing]
    # augmentation_functions = [nothing]
    chosen_function = np.random.choice(augmentation_functions)

    # Apply the chosen augmentation function
    augmented_signal = chosen_function(signal, sr)

    return augmented_signal

def nothing(signal, sr):
    return signal

def add_white_noise(signal, sr, noise_percentage_factor=0.005):
    noise = np.random.normal(0, signal.std(), signal.size)
    augmented_signal = signal + noise * noise_percentage_factor
    return augmented_signal

def pitch_scale(signal, sr, num_semitones=2):
    return librosa.effects.pitch_shift(y=signal, sr=sr, n_steps=num_semitones)

def random_gain(signal, sr, min_factor=0.8, max_factor=1.2):
    gain_rate = np.random.uniform(min_factor, max_factor)
    augmented_signal = signal * gain_rate
    return augmented_signal


One audios from the dataset is not used: 

* one audio that cannot be loaded neither with librosa or soundfile libraries.


In [8]:
'Iterate throught the genres folders, compute the spectograms and create the new spectrogram dataset'

dataset_path = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original'
spec_path = '/kaggle/working/Spectrograms_computed/'

dataset_spec = []

# if multi-threading spectrogram creation uncomment this line, otherwise comment this line
# plt.switch_backend('agg')

# Multi-processing
def process_genre(genre):
    count_song_genre = 1
    genre_path = os.path.join(dataset_path, genre)
    print(genre_path)
    for song in os.listdir(genre_path):
        song_path = os.path.join(genre_path, song)
        # Split the string on the period (.)
        split_words = song.split('.')

        # Take the first two words and concatenate them with an underscore
        song = '.'.join(split_words[:2])
        try:
            spec_path_genre = f"{spec_path}{genre}/"
            spectrogram_creation(song_path,  song, f"{spec_path_genre}{genre}{count_song_genre}")
            count_song_genre += 1
        except Exception as e:
            print(e)
        
start = perf_counter()
with multiprocessing.Pool() as pool:
        genres = os.listdir(dataset_path)
        pool.map(process_genre, genres)
end = perf_counter()

print(f"Total time: {(end-start) / 60}")
                



/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/metal/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/reggae/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/blues/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/disco



/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/rock
/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/classical
/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/jazz
/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/hiphop


  y, sr = librosa.load(audio_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/country
/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original/pop
Total time: 3.0835197758333304


**Check if the number of files from each directory is Correct**

In [9]:
# Define the parent directory you want to analyze
parent_directory = '/kaggle/working/Spectrograms_computed'
# parent_directory = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/images_original"

# Create an empty dictionary to store directory names and file counts
directory_file_counts = {}

# Walk through the parent directory and count files in each subdirectory
for root, dirs, files in os.walk(parent_directory):
    # Count the files in the current directory
    file_count = len(files)
    
    # Store the directory name and file count in the dictionary
    directory_file_counts[root] = file_count

# Print the directory names and their respective file counts
for directory, file_count in directory_file_counts.items():
    print(f"Directory: {directory}, File Count: {file_count}")

Directory: /kaggle/working/Spectrograms_computed, File Count: 0
Directory: /kaggle/working/Spectrograms_computed/jazz, File Count: 99
Directory: /kaggle/working/Spectrograms_computed/metal, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/country, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/blues, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/classical, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/pop, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/reggae, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/disco, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/rock, File Count: 100
Directory: /kaggle/working/Spectrograms_computed/hiphop, File Count: 100


In [10]:
classes = [a for a in os.listdir('/kaggle/input/gtzan-dataset-music-genre-classification/Data/images_original') if '.' not in a]
print(classes)

['disco', 'metal', 'reggae', 'blues', 'rock', 'classical', 'jazz', 'hiphop', 'country', 'pop']


In [11]:
'Create X and Y for: train, valid, test'

img_size = 256
dataset=[]
for label in classes:
    path = os.path.join(parent_directory, label)
    class_num = classes.index(label)
    
    for img in os.listdir(path):
        try:
            img_arr = cv2.imread(os.path.join(path, img))[...,::-1] #convert BGR to RGB format
            resized_arr = cv2.resize(img_arr, (img_size, img_size)) # Reshaping images to preferred size
            dataset.append([resized_arr, class_num])
        except Exception as e:
            print(e)

dataset_toUse = np.array(dataset)
print(dataset_toUse.shape)

(999, 2)


  dataset_toUse = np.array(dataset)


In [12]:
X = []
Y = []
for feature, label in dataset_toUse:
    X.append(feature)
    Y.append(label)

X = np.array(X) / 255.
Y = np.array(Y)

# X = np.load('/kaggle/input/gtzanseg6/X.npy')
# Y = np.load('/kaggle/input/gtzanseg6/Y.npy')

print(X.shape)
print(Y.shape)

(999, 256, 256, 3)
(999,)


# **Transfer learning part** #

In [13]:
import tensorflow as tf
from tensorflow.keras import applications, layers, models, regularizers
from tensorflow.keras.optimizers import Adam
from keras.applications.vgg16 import VGG16
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [14]:
x_train, x_aux, y_train, y_aux = train_test_split(X,Y, test_size = 0.2, random_state = 42)
x_test, x_valid, y_test, y_valid = train_test_split(x_aux, y_aux, test_size = 0.5, random_state = 42)

# x_train.reshape(-1, img_size, img_size, 1)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)


(799, 256, 256, 3) (799,)
(100, 256, 256, 3) (100,)
(100, 256, 256, 3) (100,)


In [15]:
model = Sequential()
model.add(Conv2D(32,3,padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.01), input_shape=(256,256,3), kernel_regularizer='l2'))
model.add(MaxPool2D())

model.add(Conv2D(32, 3, padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.01), kernel_regularizer='l2'))
model.add(MaxPool2D())

model.add(Conv2D(64, 3, padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.01), kernel_regularizer='l2'))
model.add(MaxPool2D())

model.add(Conv2D(64, 3, padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.01), kernel_regularizer='l2'))
model.add(MaxPool2D())

model.add(Flatten())
model.add(Dense(128,activation=tf.keras.layers.LeakyReLU(alpha=0.01)))
model.add(Dropout(0.2))
model.add(Dense(10, activation="softmax"))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the model
model.fit(x_train, y_train, validation_data=(x_valid, y_valid), epochs=40, batch_size=16)

# Evaluate the model on the test set
results = model.evaluate(x_test, y_test)

# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test Loss: 2.579878330230713
Test Accuracy: 0.5600000023841858


In [16]:
# Load the model chose for transfer learning, excluding the top (classification) layer
base_model =  tf.keras.applications.MobileNetV2(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

for layer in base_model.layers:
    layer.trainable = False
  
    
x = base_model.output 
# Add your custom classification head
x = keras.layers.GlobalAveragePooling2D()(x)
x = layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.005))(x)
x = layers.Dropout(0.2)(x)
output = layers.Dense(10, activation='softmax')(x)  # 10 classes for music genres

# Create a new model with the custom input and classification head
model = models.Model(inputs=base_model.input, outputs=output)

# model.summary()

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# callbacks = [EarlyStopping(monitor='val_loss', patience=7, verbose=1)]

# Fit the model
model.fit(x_train, y_train, validation_data=(x_valid, y_valid), epochs=30, batch_size=32)

# Evaluate the model on the test set
results = model.evaluate(x_test, y_test)

# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.8211221098899841
Test Accuracy: 0.7599999904632568


In [17]:
from tensorflow.keras.applications import Xception

base_model = Xception(weights='imagenet', include_top=False, input_shape=(256, 256, 3))
# Freeze the layers of the pre-trained model
for layer in base_model.layers:
    layer.trainable = False
    
base_model.layers[-1].trainable = True

# Define the custom classification head
x = base_model.output
x = keras.layers.GlobalAveragePooling2D()(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.2)(x)
output = layers.Dense(10, activation='softmax')(x)  # 10 classes for music genres

# Create a new model with the custom input and classification head
model = models.Model(inputs=base_model.input, outputs=output)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.01, momentum = 0.7), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the model
model.fit(x_train, y_train, validation_data=(x_valid, y_valid), epochs=50, batch_size=8)

# Evaluate the model on the test set
results = model.evaluate(x_test, y_test)
# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 1.2361987829208374
Test Accuracy: 0.6600000262260437
