## FCNN without data slicing

In [5]:
import librosa
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import json


In [6]:
# Audio / feature configuration for the first experiment.
# NOTE(review): with these values build_fcnn's default input shape is
# (n_mels, n_frames, 1) == (130, 13, 1); confirm it matches data["mfcc"].
genres = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
sample_rate, hop_length = 22050, 512
n_mels, n_frames = 130, 13

In [7]:
def augment_audio(audio, sr):
    """Return a pitch-shifted copy and a time-stretched copy of ``audio``.

    The two augmentations are applied independently to the original signal;
    the stretch is not applied on top of the shift.

    Args:
        audio: Audio signal handed straight to librosa (presumably a float
            array such as ``librosa.load`` returns -- confirm with the caller).
        sr: Sampling rate of ``audio``; forwarded to ``pitch_shift``.

    Returns:
        Tuple ``(audio_shifted, audio_stretched)``.

    NOTE(review): this function is not called anywhere in the visible notebook.
    """
    # np.random.randint excludes its upper bound, so an upper bound of 2 would
    # never draw +2. Use 3 to sample the symmetric range -2..+2 semitones.
    n_steps = np.random.randint(-2, 3)
    audio_shifted = librosa.effects.pitch_shift(audio, n_steps=n_steps, sr=sr)

    # Random playback-rate factor in [0.8, 1.2)
    rate = np.random.uniform(0.8, 1.2)
    audio_stretched = librosa.effects.time_stretch(audio, rate=rate)
    return audio_shifted, audio_stretched

In [8]:
data_path = '/content/sample_data/data.json'

# Read the pre-extracted features and labels from disk
with open(data_path, "r") as fp:
    data = json.load(fp)

# Define X and y (features from "mfcc", labels from "genre_num")
X, y = np.array(data["mfcc"]), np.array(data["genre_num"])

# Stratified train/validation/test split:
# 30% is held out for test, then 30% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, stratify=y_train, random_state=42
)

# Append a trailing channel axis to every split (Conv2D expects one)
X_train, X_val, X_test = (
    subset[..., np.newaxis] for subset in (X_train, X_val, X_test)
)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout, GlobalAveragePooling2D, Dense

def build_fcnn(input_shape=(n_mels, n_frames, 1),num_classes=10):
    """Build an uncompiled 4-stage convolutional classifier.

    Stages 1-3 are Conv2D(3x3, relu, same) -> BatchNorm -> 2x2 MaxPool ->
    Dropout(0.3) with 32, 64 and 128 filters. Stage 4 is Conv2D(128) ->
    BatchNorm -> GlobalAveragePooling2D -> Dropout(0.3), followed by a softmax
    output layer.

    Args:
        input_shape: Shape of one sample, channels last. The default is
            evaluated once, when this ``def`` runs, from the module-level
            ``n_mels`` and ``n_frames``; re-run the cell after changing them.
        num_classes: Number of output classes, i.e. units in the softmax layer.

    Returns:
        A ``Sequential`` model (not compiled).
    """
    model = Sequential([
        #layer 1
        Conv2D(32, (3,3), activation='relu',padding='same',input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        #layer 2
        Conv2D(64, (3,3), activation='relu',padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        #layer 3
        Conv2D(128, (3,3), activation='relu',padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),

        #layer 4
        Conv2D(128, (3,3), activation='relu',padding='same'),
        BatchNormalization(),
        GlobalAveragePooling2D(),  # collapses the spatial dims to one value per filter
        Dropout(0.3),
        # The softmax must have one unit per class. A hard-coded 64 would
        # ignore num_classes and spread probability over non-existent classes.
        Dense(num_classes, activation='softmax')
    ])

    return model

In [10]:
import tensorflow as tf

# List the GPUs TensorFlow can see; an empty list means no GPU is visible.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU available:", gpu_devices)

GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# Step 5: Build, compile, and summarize model
from tensorflow.keras.optimizers import Adam

model = build_fcnn()

# learning_rate=1e-4 is NOT Adam's default (Keras defaults to 1e-3).
# NOTE(review): the saved training log reports learning_rate: 0.0010, so the
# recorded run may have used a different value -- confirm which is intended.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Training

In [12]:
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

# Step 6: Train model on T4 GPU
# Halve the learning rate after 10 epochs without val_loss improvement.
# Stop after 20 such epochs and restore the best weights seen.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10)
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=250,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler],
)

# Final evaluation on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test_accuracy : {test_accuracy:.4f}")

Epoch 1/250
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 127ms/step - accuracy: 0.2948 - loss: 3.0047 - val_accuracy: 0.2003 - val_loss: 4.5167 - learning_rate: 0.0010
Epoch 2/250
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5747 - loss: 1.3837 - val_accuracy: 0.4764 - val_loss: 1.5750 - learning_rate: 0.0010
Epoch 3/250
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6304 - loss: 1.1379 - val_accuracy: 0.5627 - val_loss: 1.2544 - learning_rate: 0.0010
Epoch 4/250
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6795 - loss: 0.9955 - val_accuracy: 0.5784 - val_loss: 1.3065 - learning_rate: 0.0010
Epoch 5/250
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7102 - loss: 0.8910 - val_accuracy: 0.6271 - val_loss: 1.1354 - learning_rate: 0.0010
Epoch 6/250
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

In [13]:
# Persist the trained network to disk (the .h5 suffix selects the HDF5 format)
fcnn_melspec_path = '/content/sample_data/fcnn_melspec_gtzan.h5'
model.save(fcnn_melspec_path)



## FCNN with data slicing

In [None]:
import librosa
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout, GlobalAveragePooling2D, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

# Enable mixed precision for faster GPU training.
# NOTE: this sets a global policy -- layers created after this call, in any
# later cell of the same kernel, are built under 'mixed_float16'.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Parameters
genres = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
data_path = '/content/sample_data/data.json'  # Update with your GTZAN dataset path
batch_size = 16

sample_rate = 22050
hop_length = 512
n_mels = 130
segment_length = 3  # 3-second clips
# Frames per segment: int(3 * 22050 / 512) + 1 == 130
n_frames = int(segment_length * sample_rate / hop_length) + 1

# Function to split audio into fixed-length segments
def split_audio(audio, sr, segment_length=3, drop_remainder=False):
    """Cut ``audio`` into consecutive, non-overlapping segments.

    Args:
        audio: Sliceable sequence of samples (must support ``len`` and slicing).
        sr: Sampling rate in samples per second.
        segment_length: Segment duration in seconds (default 3).
        drop_remainder: When True, discard a trailing segment shorter than a
            full segment so every returned segment has the same length. The
            default (False) keeps the short tail.

    Returns:
        List of slices of ``audio`` in order.

    Raises:
        ValueError: If ``segment_length * sr`` truncates to less than one
            sample, since no forward progress is possible.

    NOTE(review): this function is not called anywhere in the visible notebook.
    """
    samples_per_segment = int(segment_length * sr)
    if samples_per_segment < 1:
        raise ValueError(
            f"segment_length * sr must be at least 1 sample, got {samples_per_segment}"
        )

    segments = [
        audio[start:start + samples_per_segment]
        for start in range(0, len(audio), samples_per_segment)
    ]
    # Only the last slice can be short; drop it on request so that downstream
    # fixed-size models do not receive a ragged final segment.
    if drop_remainder and segments and len(segments[-1]) < samples_per_segment:
        segments.pop()
    return segments



# Load dataset
# This cell calls json.load, but the section's import block does not import
# json. Import it here so the section also runs on its own in a fresh kernel.
import json

print("Loading and preprocessing data...")

with open(data_path, "r") as fp:
    data = json.load(fp)

# Define X and y (features from "mfcc", labels from "genre_num")
X = np.array(data["mfcc"])
y = np.array(data["genre_num"])

# Stratified train/validation/test split:
# 30% is held out for test, then 30% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

X_train = X_train[..., np.newaxis]  # Add channel dimension
X_val = X_val[..., np.newaxis]      # Add channel dimension
X_test = X_test[..., np.newaxis]    # Add channel dimension


# Create tf.data datasets for efficient loading.
# Only the training set is shuffled; validation and test keep their order.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build simpler FCNN
def build_fcnn(input_shape=(n_mels, n_frames, 1), num_classes=10):
    """Build an uncompiled 3-stage convolutional classifier.

    Two Conv2D -> BatchNorm -> 2x2 MaxPool -> Dropout(0.3) stages (16 and 32
    filters) are followed by Conv2D(64) -> BatchNorm -> global average pooling
    -> Dropout(0.3) and a float32 softmax output layer.

    Args:
        input_shape: Shape of one sample, channels last. The default is
            evaluated once, when this ``def`` runs, from the module-level
            ``n_mels`` and ``n_frames``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``Sequential`` model (not compiled).
    """
    conv_kwargs = dict(activation='relu', padding='same')

    stack = [
        Conv2D(16, (3, 3), input_shape=input_shape, **conv_kwargs),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),
    ]
    stack += [
        Conv2D(32, (3, 3), **conv_kwargs),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.3),
    ]
    stack += [
        Conv2D(64, (3, 3), **conv_kwargs),
        BatchNormalization(),
        GlobalAveragePooling2D(),
        Dropout(0.3),
        # The output layer is pinned to float32 even under the mixed-precision policy
        Dense(num_classes, activation='softmax', dtype='float32'),
    ]
    return Sequential(stack)

# Build and compile model
model = build_fcnn()
# learning_rate=1e-4 is NOT Adam's default (Keras defaults to 1e-3).
# NOTE(review): the saved training log reports learning_rate: 0.0010 -- confirm.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
model.summary()

# Step 6: Train model on T4 GPU
# Halve the learning rate after 10 epochs without val_loss improvement.
# Stop after 20 such epochs and restore the best weights seen.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10)
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

print("Training model...")
history = model.fit(
    train_dataset,
    epochs=250,
    validation_data=val_dataset,
    callbacks=[early_stopping, lr_scheduler],
)

# Evaluate on test set
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Plot training/validation accuracy per epoch
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

Loading and preprocessing data...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model...
Epoch 1/250
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 25ms/step - accuracy: 0.3796 - loss: 1.7393 - val_accuracy: 0.5098 - val_loss: 1.3425 - learning_rate: 0.0010
Epoch 2/250
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5611 - loss: 1.2835 - val_accuracy: 0.5832 - val_loss: 1.2021 - learning_rate: 0.0010
Epoch 3/250
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5900 - loss: 1.1820 - val_accuracy: 0.6357 - val_loss: 1.0807 - learning_rate: 0.0010
Epoch 4/250
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6148 - loss: 1.1112 - val_accuracy: 0.6500 - val_loss: 1.0102 - learning_rate: 0.0010
Epoch 5/250
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6292 - loss: 1.0644 - val_accuracy: 0.5937 - val_loss: 1.1470 - learning_rate: 0.0010
Epoch 6/250
[1m306/306[0m [32m━━━━━━━━━━━━

In [None]:
# Persist the trained network to disk (the .h5 suffix selects the HDF5 format)
fcnn_splice_path = '/content/sample_data/fcnn_splice_songs_model.h5'
model.save(fcnn_splice_path)

## FCNN with data augmentation

In [None]:
import librosa
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout, GlobalAveragePooling2D, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
import json

# Enable mixed precision.
# NOTE: this sets a global policy -- layers created after this call, in any
# later cell of the same kernel, are built under 'mixed_float16'.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Parameters
genres = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
data_path = '/content/sample_data/data.json'  # Update with your GTZAN dataset path
batch_size = 32  # Increased for GPU

sample_rate = 22050
hop_length = 512
n_mels = 130
segment_length = 3  # 3-second clips
# Frames per segment: int(3 * 22050 / 512) + 1 == 130
n_frames = int(segment_length * sample_rate / hop_length) + 1

# Audio augmentations
def augment_audio(audio, sr):
    """Apply up to three random augmentations, each with probability 0.5.

    In order: time reversal, pitch shift by a uniform amount in [-2, 2)
    semitones, and time stretch by a uniform rate in [0.8, 1.2). Each step
    operates on the result of the previous one.

    Args:
        audio: Audio signal (presumably a 1-D array such as ``librosa.load``
            returns -- confirm with the caller).
        sr: Sampling rate, forwarded to ``pitch_shift``.

    Returns:
        The augmented signal, or ``audio`` itself if no step was selected.

    NOTE(review): this function is not called anywhere in the visible notebook.
    """
    def coin_flip():
        # One uniform draw per augmentation decision
        return np.random.rand() < 0.5

    # Time-reversal
    if coin_flip():
        audio = audio[::-1]

    # Pitch shift; the amount is drawn only when the step is selected
    if coin_flip():
        audio = librosa.effects.pitch_shift(
            audio, sr=sr, n_steps=np.random.uniform(-2, 2)
        )

    # Time stretch; the rate is drawn only when the step is selected
    if coin_flip():
        audio = librosa.effects.time_stretch(
            audio, rate=np.random.uniform(0.8, 1.2)
        )

    return audio

# Load dataset
print("Loading and preprocessing data...")

with open(data_path, "r") as fp:
    data = json.load(fp)

# Define X and y (features from "mfcc", labels from "genre_num")
X, y = np.array(data["mfcc"]), np.array(data["genre_num"])

# Stratified train/validation/test split:
# 30% is held out for test, then 30% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, stratify=y_train, random_state=42
)

# Append a trailing channel axis to every split
X_train, X_val, X_test = (
    subset[..., np.newaxis] for subset in (X_train, X_val, X_test)
)

# Create tf.data datasets (only the training set is shuffled)
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=1000)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Build FCNN
def build_fcnn(input_shape=(n_mels, n_frames, 1), num_classes=10):
    """Build an uncompiled 3-stage convolutional classifier.

    Layer order: [Conv2D(16), BN, MaxPool, Dropout], [Conv2D(32), BN, MaxPool,
    Dropout], Conv2D(64), BN, GlobalAveragePooling2D, Dropout, then a float32
    softmax output layer.

    Args:
        input_shape: Shape of one sample, channels last. The default is
            evaluated once, when this ``def`` runs, from the module-level
            ``n_mels`` and ``n_frames``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``Sequential`` model (not compiled).
    """
    def conv_bn(filters, **extra):
        # 3x3 same-padded ReLU convolution followed by batch normalisation
        return [
            Conv2D(filters, (3, 3), activation='relu', padding='same', **extra),
            BatchNormalization(),
        ]

    def pool_drop():
        # 2x2 down-sampling followed by 30% dropout
        return [MaxPooling2D((2, 2)), Dropout(0.3)]

    return Sequential([
        *conv_bn(16, input_shape=input_shape),
        *pool_drop(),
        *conv_bn(32),
        *pool_drop(),
        *conv_bn(64),
        GlobalAveragePooling2D(),
        Dropout(0.3),
        Dense(num_classes, activation='softmax', dtype='float32'),
    ])

# Build and compile model
model = build_fcnn()
# learning_rate=1e-4 is NOT Adam's default (Keras defaults to 1e-3).
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# Step 6: Train model on T4 GPU
# early_stopping is defined exactly once. A second assignment with patience=10
# used to override this one. That matched the LR scheduler's patience, so
# training stopped on the same epoch the first LR reduction fired and the
# reduction could never help. patience=20 matches the other experiments.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10)

print("Training model...")
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=250,
    callbacks=[early_stopping,lr_scheduler]
)

# Evaluate on the held-out test split
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Plot training/validation accuracy per epoch
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()