In [2]:
pip install librosa pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


### Load the Data via Librosa


In [3]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pydub import AudioSegment
import zipfile
import io
import os



In [4]:
folder_path = '/content/drive/MyDrive/MP3-Example'

genres = os.listdir(folder_path)
genres.remove('.DS_Store')
print(genres)

['New Age', 'Electronic', 'Blues', 'World', 'Rock', 'RnB', 'Pop', 'Metal', 'Latin', 'Reggae', 'Jazz', 'Rap', 'Country', 'Folk', 'Punk']


### Convert the files to .wav


In [5]:
data = []

for genre in genres:
    genre_folder_path = os.path.join(folder_path, genre)
    if genre_folder_path == "/content/MP3-Example/.DS_Store":
      continue
    for filename in os.listdir(genre_folder_path):

        #check iof the file is an mp3
        if filename.endswith(".mp3"):
            mp3_audio_path = os.path.join(genre_folder_path, filename)

            # Convert MP3 to WAV using pydub
            audio = AudioSegment.from_mp3(mp3_audio_path)
            audio.export('temp.wav', format='wav')

            # Load the WAV file with librosa
            wav_audio_path = 'temp.wav'
            y, sr = librosa.load(wav_audio_path, duration=30)  # Load the first 30 seconds so all the audio files have the same length

            #y represents the audio time series
            #sr represents the sampling rate, maybe try to reduce this as it is a high number
            #n_mfcc represents the number of MFCC's to return
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            # Visualize the waveform
            """plt.figure(figsize=(12, 4))
            librosa.display.waveshow(y, sr=sr)
            plt.title(f'{genre} - {filename} - Waveform')
            plt.show()"""

            # Append the loaded data to the dataset
            data.append((mfccs, genre))

# Clean up temporary files that are no longer needed
os.remove('temp.wav')

###Now Build the Model


In [46]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.utils import to_categorical

x = []
y = []

for song in data:
  x.append(song[0])
  y.append(song[1])

song_data = np.array(x)
genres = np.array(y)

X = song_data.reshape(song_data.shape[0], song_data.shape[1], song_data.shape[2], 1)

print(X.shape)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(genre)
num_classes = len(label_encoder.classes_)
y_categorical = to_categorical(y_encoded, num_classes=num_classes)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# Define CNN model
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=X_train.shape[1:]),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)



(1500, 13, 1292, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 7.440633296966553
Test Accuracy: 0.09000000357627869
