In [1]:
import librosa
from sklearn.model_selection import train_test_split
import numpy as np
from google.colab import drive
import os
import json
import tensorflow.keras as keras
import pickle #Para exportar el modelo

In [26]:
# Mount Google Drive at /content/drive; every dataset/model path used below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Training configuration.
data_path = "/content/drive/My Drive/Machine Learning TP/Datasets/Audio/jsondataaudio.json"
epochs = 40 # Number of full training passes over the whole training set
batch_size = 16 # Number of samples used in each training iteration
num_keywords = 10 # Number of song genres (size of the softmax output layer)
neuronas_por_capa = 128 # Units per layer, shared by the LSTM and hidden Dense layers in build_model

In [4]:
def load_dataset(data_path):
  """Read the JSON dataset at ``data_path`` and return its features and labels.

  The file must be a JSON object with the keys ``"mfcc"`` and ``"labels"``.

  Returns:
    A ``(X, y)`` tuple of NumPy arrays built from ``"mfcc"`` and ``"labels"``
    respectively.
  """
  with open(data_path, "r") as json_file:
    dataset = json.load(json_file)

  # Both entries are stored as plain (nested) lists; hand them back as NumPy arrays.
  return np.array(dataset["mfcc"]), np.array(dataset["labels"])

In [5]:
def get_data_splits(data_path, test_size = 0.2, test_validation_size = 0.1, random_state = None):
  """Load the dataset and split it into train, validation and test subsets.

  Args:
    data_path: path to the JSON dataset, forwarded to ``load_dataset``.
    test_size: ``test_size`` of the first ``train_test_split`` call, i.e. the
      share of all samples held out for testing.
    test_validation_size: ``test_size`` of the second ``train_test_split`` call,
      i.e. the share of the remaining (non-test) samples held out for validation.
    random_state: optional seed forwarded to both ``train_test_split`` calls so
      the splits can be reproduced. The default ``None`` keeps the original,
      unseeded behaviour.

  Returns:
    ``(X_train, X_validation, X_test, y_train, y_validation, y_test)``.
  """
  # Load the dataset
  X, y = load_dataset(data_path)

  # First carve out the test set, then take the validation set from what is left.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size = test_size, random_state = random_state)
  X_train, X_validation, y_train, y_validation = train_test_split(
      X_train, y_train, test_size = test_validation_size, random_state = random_state)

  return X_train, X_validation, X_test, y_train, y_validation, y_test

In [6]:
def predict(X, y, model):
  """Run ``model`` on the single sample ``X`` and print expected vs. predicted label.

  Args:
    X: one sample; a leading batch axis is added before calling the model.
    y: the expected label, only used in the printed message.
    model: object exposing ``predict(batch)`` that returns per-class scores.
  """
  lote = X[np.newaxis, ...]  # the model expects a batch, so wrap the sample in one
  puntajes = model.predict(lote)
  # Index of the highest score along the class axis = predicted label.
  etiqueta_predicha = np.argmax(puntajes, axis=1)
  print(f"Esperado: {y} Prediccion: {etiqueta_predicha}")


In [7]:
def build_model(input_shape):
  """Assemble the stacked-LSTM classifier (the model is returned uncompiled).

  Args:
    input_shape: per-sample input shape given to the first LSTM layer.

  Returns:
    A ``keras.Sequential`` model: LSTM -> LSTM -> Dense(relu) -> Dropout -> Dense(softmax).
    Layer widths come from the module-level ``neuronas_por_capa`` and the output
    size from ``num_keywords``.
  """
  capas = [
      # First LSTM returns the whole sequence so the second LSTM can consume it.
      keras.layers.LSTM(neuronas_por_capa, input_shape = input_shape, return_sequences = True),
      # Second LSTM collapses the sequence into a single vector.
      keras.layers.LSTM(neuronas_por_capa),
      keras.layers.Dense(neuronas_por_capa, activation = "relu"),
      # Dropout to reduce overfitting.
      keras.layers.Dropout(0.3),
      # Softmax classifier, one output per class.
      keras.layers.Dense(num_keywords, activation = "softmax"),
  ]

  return keras.Sequential(capas)

Cargar los datos de entrenamiento, validación y testeo

In [8]:
# Load the dataset and split it with the default proportions of get_data_splits.
X_train, X_validation, X_test, y_train, y_validation, y_test = get_data_splits(data_path) 

modelo

In [9]:
# Per-sample input shape the RNN expects: axes 1 and 2 of X_train
# (presumably time steps x MFCC coefficients — confirm against the dataset builder).
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_model(input_shape)

In [10]:
# Size of axis 1 of X_train, i.e. the first component of input_shape.
X_train.shape[1]

130

In [11]:
# Peek at one training sample (a 2-D array taken from the dataset's "mfcc" entry).
X_train[1]

array([[-34.84450531,  69.19416809,   7.18058872, ...,   2.24903345,
          8.0829401 ,  -4.05803204],
       [-45.11981964,  66.3653717 ,  10.88868046, ...,   5.27150631,
          8.87207222,  -6.32608461],
       [-57.92111588,  67.58872986,  17.56464386, ...,  10.76859665,
         10.5799427 ,  -7.9318924 ],
       ...,
       [  2.4099164 ,  69.01014709, -10.54534912, ...,   1.62769592,
         -3.445333  ,   8.50748253],
       [ -6.40112925,  80.01934052, -11.05541992, ...,   2.31869721,
         -3.95041037,  11.12710571],
       [-12.39131355,  89.37961578,  -6.99657393, ...,  -0.55116796,
         -3.91489124,  10.99953651]])

In [12]:
# Size of axis 2 of X_train, i.e. the second component of input_shape.
X_train.shape[2]

13

In [13]:
# Peek at another training sample to compare its value range with the previous one.
X_train[2]

array([[-374.94509888,  140.54135132,   10.47996712, ...,   -5.33221674,
          -8.97377205,   -1.52721667],
       [-378.9916687 ,  137.137146  ,   15.81788731, ...,   -3.85990477,
          -5.98600197,   -2.0685873 ],
       [-391.92739868,  123.05625916,   17.60031128, ...,   -1.7682755 ,
          -3.25207329,   -2.70354295],
       ...,
       [-224.27722168,   87.91098022,  -12.95156288, ...,  -11.75883865,
           1.16182923,  -10.77908516],
       [-233.71087646,   90.4940033 ,  -21.31600189, ...,  -15.76560783,
          -6.64618206,   -8.98813248],
       [-233.99237061,   94.3560257 ,  -31.77109528, ...,  -18.94568253,
          -8.21642494,   -5.36473417]])

In [14]:
# Compile with Adam and sparse categorical cross-entropy (labels are passed as integer
# class ids); tracking "accuracy" makes model.fit report it for every epoch.
optimizador = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizador,
    metrics=["accuracy"],
)

entrenar

In [16]:
# Train on the training split, evaluating on the validation split after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_validation, y_validation),
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f6d79999910>

testeo

In [17]:
# Evaluate on the held-out test split: evaluate() returns the loss followed by the
# metrics configured in compile() (here, accuracy).
test_error, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test error: {test_error}, test accuracy: {test_accuracy}")

Test error: 0.9194180369377136, test accuracy: 0.7632632851600647


In [20]:
# Spot-check a single prediction; index 100 is an arbitrary sample of the test split.
X = X_test[100] # Any sample from the test split
y = y_test[100]
predict(X, y, model)

Esperado: 1 Prediccion: [1]


guardar modelo

In [30]:
# Save the trained model to Drive.
# NOTE(review): the model is serialized with pickle and read back by cargar_modelo()
# with pickle.load, so the two must be changed together if this is ever switched to
# Keras' own model.save()/load_model(). Only unpickle files you trust: pickle.load can
# execute arbitrary code.
with open('/content/drive/My Drive/Machine Learning TP/Modelos/Audio/Genero_model.pkl', 'wb') as handle:
    pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)


# Disabled alternative (save_path_model is not defined in the visible cells):
#model.save(save_path_model)



INFO:tensorflow:Assets written to: ram://0b7d49cf-4708-482b-b99a-3f7ef8bde11c/assets


INFO:tensorflow:Assets written to: ram://0b7d49cf-4708-482b-b99a-3f7ef8bde11c/assets


# Predicciones

Acá están todas las funciones necesarias para realizar predicciones con el modelo entrenado.

In [51]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [71]:
import audioread
import math

In [65]:
# Inputs needed for prediction: audio_path points at an arbitrary audio file, and
# data_path at the JSON file from which the genre names are read to label the prediction.
audio_path = "/content/drive/My Drive/8_-_My_Iron_Lung_-_Radiohead.mp3"
#audio_path = "/content/drive/My Drive/Machine Learning TP/Datasets/Audio/Data/genres_original/jazz/jazz.00099.wav"
data_path = "/content/drive/My Drive/Machine Learning TP/Datasets/Audio/jsondataaudio.json"

# Sample rate (Hz) that procesar_audio passes to librosa.load.
frecuencia_muestra = 22050

# Duration, as reported by audioread, of the file at audio_path above.
with audioread.audio_open(audio_path) as f:
  duracion = f.duration
#duracion = 30
# Number of samples in 30 seconds of audio at frecuencia_muestra.
num_de_muestras_por_audio = frecuencia_muestra * 30

In [39]:
#Cargamos el modelo entrenado, ya que cuesta bastante tiempo entrenarlo
def cargar_modelo(model_path = '/content/drive/My Drive/Machine Learning TP/Modelos/Audio/Genero_model.pkl'):
  """Load the previously trained model from disk (training it again is slow).

  Args:
    model_path: path of the pickle file to read. Defaults to the location the
      training section of this notebook saves the model to.

  Returns:
    The object stored in the pickle file.
  """
  # SECURITY: pickle.load can execute arbitrary code while deserializing, so only
  # point this at files written by this notebook or another trusted source.
  with open(model_path, 'rb') as handle:
    return pickle.load(handle)

In [40]:
def cargar_generos(data_path):
  """Return the contents of the ``"generos"`` entry of the JSON file as a NumPy array.

  Args:
    data_path: path to a JSON file containing a ``"generos"`` key.
  """
  with open(data_path, "r") as archivo_json:
    contenido = json.load(archivo_json)

  return np.array(contenido["generos"])

In [41]:
#Procesamos el audio para obtener el mfcc, similar a la implementacion del archivo para pasar los audios a json.
def procesar_audio(audio_path, n_mfcc = 13, n_fft = 2048, hop_length = 512, num_segmentos = 10):
  
  num_de_muestras_por_segmento = int(num_de_muestras_por_audio / num_segmentos)
  num_de_vectores_mfcc_por_segmento = math.ceil(num_de_muestras_por_segmento / hop_length)

  signal, sr = librosa.load(audio_path, sr=frecuencia_muestra, duration = 30.0)
  
  for segmento in range(num_segmentos):

          inicio_de_muestra = num_de_muestras_por_segmento * segmento 
          fin_de_muestra = inicio_de_muestra + num_de_muestras_por_segmento

          mfcc = librosa.feature.mfcc(signal[inicio_de_muestra: fin_de_muestra],
                                      sr=sr,
                                      n_fft = n_fft,
                                      n_mfcc = n_mfcc,
                                      hop_length = hop_length)
          mfcc = mfcc.T
  return mfcc

In [42]:
def predecir(X, model):
  """Return the predicted class index for the single sample ``X``.

  Args:
    X: one sample; a leading batch axis is added before calling the model.
    model: object exposing ``predict(batch)`` that returns per-class scores.

  Returns:
    The array of arg-max class indices along axis 1 (one entry, since the batch
    holds one sample).
  """
  lote = X[np.newaxis, ...]  # wrap the sample in a batch of size one
  puntajes = model.predict(lote)
  return np.argmax(puntajes, axis=1)

In [69]:
def predict(audio_path):
  """Predict and print the genre of the audio file at ``audio_path``.

  The prediction is only attempted when the file lasts at least 30 seconds;
  otherwise a message is printed and nothing else happens.

  NOTE: this definition rebinds the name ``predict`` used earlier in the notebook
  for the ``predict(X, y, model)`` helper.

  Args:
    audio_path: path of the audio file to classify.
  """
  # Measure the file that is actually being classified. The check used to read the
  # module-level ``duracion``, which belongs to the module-level ``audio_path``.
  with audioread.audio_open(audio_path) as f:
    duracion_audio = f.duration

  # Only predict if the audio lasts 30 seconds or more.
  if duracion_audio < 30:
    print(f"No se puede predecir: el audio dura {duracion_audio:.1f} s y se necesitan al menos 30 s")
    return

  mfcc = procesar_audio(audio_path)

  generos = cargar_generos(data_path)
  prediccion = predecir(mfcc, cargar_modelo())

  print(f"Prediccion de genero: {generos[prediccion[0]]}")

In [73]:
# Try the whole prediction pipeline on a hip-hop track from the dataset folder.
predict("/content/drive/My Drive/Machine Learning TP/Datasets/Audio/Data/genres_original/hiphop/hiphop.00087.wav")





Prediccion de genero: hiphop
