In [18]:
import librosa
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [19]:
# Audio / feature settings shared by the cells below.
sample_rate = 22050  # Hz; target sampling rate handed to librosa.load
n_mels = 130         # mel bands requested from melspectrogram; also dim 0 of build_fcnn's default input
hop_length = 512     # hop (in samples) between successive spectrogram frames
n_frames = 13        # frames kept per clip (crop/pad target); also dim 1 of build_fcnn's default input

# GTZAN genre folder names; load_gtzan_data uses each name as the label of the clips inside it.
genres = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]

In [20]:
def load_gtzan_data(data_path):
    """Build fixed-width log-mel spectrograms for every ``.wav`` clip under ``data_path``.

    The directory is expected to contain one sub-folder per entry of the
    module-level ``genres`` list. Each clip is loaded at ``sample_rate``,
    converted to a mel spectrogram (``n_mels`` bands, ``hop_length`` hop),
    expressed in dB relative to its own peak, and then cropped or padded
    along the time axis to exactly ``n_frames`` columns.

    Args:
        data_path: Root directory holding the per-genre sub-folders.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` stacks one
        ``(n_mels, n_frames)`` spectrogram per successfully processed clip,
        and ``y`` holds the matching genre name for each entry of ``X``.

    Raises:
        FileNotFoundError: If a genre sub-folder is missing (the directory
            listing happens outside the per-file ``try`` block).

    Files that fail to load or transform are reported with ``print`` and
    skipped rather than aborting the whole run.
    """
    X, y = [], []

    for genre in genres:
        genre_path = os.path.join(data_path, genre)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split) reproducible.
        for file in sorted(os.listdir(genre_path)):
            if not file.endswith('.wav'):
                continue
            file_path = os.path.join(genre_path, file)

            try:
                audio, sr = librosa.load(file_path, sr=sample_rate)
                mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, hop_length=hop_length)
                mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

                missing = n_frames - mel_spec_db.shape[1]
                if missing > 0:
                    # With ref=np.max the loudest bin is 0 dB and everything
                    # else is negative, so zero-padding would look like
                    # full-power signal. Pad with the clip's quietest value
                    # so the padding reads as silence.
                    mel_spec_db = np.pad(
                        mel_spec_db,
                        ((0, 0), (0, missing)),
                        mode='constant',
                        constant_values=mel_spec_db.min(),
                    )
                else:
                    mel_spec_db = mel_spec_db[:, :n_frames]
            except Exception as e:
                # Best-effort loading: report the bad file and move on.
                print(f"Error processing {file_path}: {e}")
                continue

            X.append(mel_spec_db)
            y.append(genre)
    return np.array(X), np.array(y)

In [21]:
def augment_audio(audio, sr):
    """Create two randomly perturbed variants of an audio signal.

    Args:
        audio: Audio time series, as accepted by ``librosa.effects``.
        sr: Sampling rate of ``audio``.

    Returns:
        A tuple ``(audio_shifted, audio_stretched)``: the input pitch-shifted
        by a random whole number of semitones in [-2, 2], and the input
        time-stretched by a random rate drawn uniformly from [0.8, 1.2).
        The stretched signal generally has a different length from the input.

    Both draws use NumPy's global random state, so seed ``np.random`` for
    reproducible augmentation.
    """
    # randint's upper bound is exclusive: (-2, 3) yields -2..2 inclusive.
    # The previous (-2, 2) could never shift up by two semitones.
    n_steps = np.random.randint(-2, 3)
    rate = np.random.uniform(0.8, 1.2)

    audio_shifted = librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)
    audio_stretched = librosa.effects.time_stretch(audio, rate=rate)
    return audio_shifted, audio_stretched

In [22]:
import json


data_path = '../Data/data.json'

# Read the pre-extracted feature file into memory.
with open(data_path, "r") as fp:
    data = json.load(fp)

# Define X and y (features and genre labels; labels are presumably integer ids -- confirm against the JSON).
X = np.array(data["mfcc"])
y = np.array(data["genre_num"])

# Stratified train / validation / test split:
# 30% is held out for test, then 30% of the remainder becomes validation
# (roughly 49% train / 21% validation / 30% test overall).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

# Append a trailing channel axis to every feature split for the Conv2D input.
X_train, X_val, X_test = (
    features[..., np.newaxis] for features in (X_train, X_val, X_test)
)

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout, GlobalAveragePooling2D, Dense

def build_fcnn(input_shape=(n_mels, n_frames, 1),num_classes=10):
    """Build the (uncompiled) convolutional genre classifier.

    The network stacks four 3x3 'same'-padded convolution stages
    (32, 64, 128, 128 filters), each followed by batch normalization.
    The first three stages end with 2x2 max pooling and dropout; the fourth
    is reduced with global average pooling, followed by dropout and a
    softmax classification layer.

    Args:
        input_shape: Shape of one input sample as (height, width, channels).
        num_classes: Number of output classes, i.e. the width of the final
            softmax layer.

    Returns:
        A ``Sequential`` Keras model that still needs to be compiled.
    """
    model = Sequential([
        # Stage 1
        Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        # Stage 2
        Conv2D(64, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        # Stage 3
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        # Stage 4: global average pooling stands in for a Flatten + hidden
        # Dense stack, leaving only the classification layer below.
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        GlobalAveragePooling2D(),
        Dropout(0.3),

        # One output unit per class. This was hard-coded to 64 units, which
        # ignored num_classes and spread the softmax over 54 classes that do
        # not exist.
        Dense(num_classes, activation='softmax'),
    ])

    return model

In [24]:
# Instantiate the network with its default input shape and class count.
model = build_fcnn()

# The sparse categorical loss takes class-index labels rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 130, 13, 32)       320       
                                                                 
 batch_normalization_8 (Bat  (None, 130, 13, 32)       128       
 chNormalization)                                                
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 65, 6, 32)         0         
 g2D)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 65, 6, 32)         0         
                                                                 
 conv2d_9 (Conv2D)           (None, 65, 6, 64)         18496     
                                                                 
 batch_normalization_9 (Bat  (None, 65, 6, 64)        

## Training

In [27]:
from tensorflow.keras.callbacks import EarlyStopping

# Halt training once validation loss has gone 10 epochs without improving,
# and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Up to 250 epochs; early stopping normally ends the run well before that.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=250,
    batch_size=64,
    callbacks=[early_stopping],
)

# Final score on the held-out test split, which was not used during fitting.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test_accuracy : {test_accuracy:.4f}")

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Test_accuracy : 0.8705


In [29]:
# Write the trained model to disk (the .h5 suffix presumably selects Keras' legacy HDF5 format -- confirm for the installed version).
# NOTE(review): the file name says "melspec", but the training arrays above are read from data["mfcc"] -- confirm the intended name.
model.save('../models/fcnn_melspec_gtzan.h5')