# Guided Projects Artificial Intelligence & Machine Learning
## Guided Projects: Deep Learning
### Speech – Music Recognition
Deep learning techniques such as RNN or CNNs can be used for classification of an audio 
segment. Given input audio, the trained model can predict the genre or class the audio segment
belongs to which can be used for music/speech analysis. For example, using this approach, 
we can detect the language being spoken or the instrument being played.
### Question:
Use GTZAN music genre dataset for classifying the song genre. Train a RNN that takes a song 
as an input and predicts the corresponding genre for the song. For implementation purposes, 
you can just use a specific time duration of the song as input (eg: just the first 10-20 seconds 
of each song). The dataset has a total of 10 classes, you may train for any 5 or all the classes 
using 90% of available data for training and rest for the test dataset.

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report

In [2]:
# Import Dataset
data = pd.read_csv('../21. Speech Music Recognition/Data/features_30_sec.csv')
data.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [3]:
# Remove Filename and Length. Save to X and Y
data.pop('filename')
data.pop('length')
Y = pd.DataFrame(data.pop('label'))
X = data

In [4]:
X.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,901505.4,...,0.75274,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035
1,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2977893.0,...,0.927998,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282
2,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,784034.5,...,2.45169,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025
3,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1493194.0,...,0.780874,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339
4,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1572978.0,...,-4.520576,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516


In [5]:
Y.head()

Unnamed: 0,label
0,blues
1,blues
2,blues
3,blues
4,blues


In [6]:
# Normalize the data
min_max = preprocessing.MinMaxScaler()
scaled_df = min_max.fit_transform(X.values)
final_df = pd.DataFrame(scaled_df,columns=X.columns)
final_df.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.362279,0.695468,0.318188,0.101983,0.314117,0.040233,0.422879,0.109789,0.385532,0.102567,...,0.564186,0.112704,0.541287,0.057615,0.422444,0.08245,0.48795,0.109503,0.599189,0.078255
1,0.343622,0.793392,0.230894,0.08558,0.248405,0.121475,0.436889,0.296867,0.353329,0.342717,...,0.570198,0.120353,0.574685,0.118286,0.448184,0.102997,0.5488,0.099081,0.57964,0.075947
2,0.389832,0.640692,0.433652,0.099064,0.254261,0.049046,0.325334,0.095712,0.289224,0.088981,...,0.622467,0.081898,0.331003,0.085962,0.370988,0.11642,0.453724,0.087317,0.501381,0.045405
3,0.473508,0.777954,0.345856,0.22916,0.129376,0.058253,0.267404,0.227566,0.181068,0.171001,...,0.565151,0.091876,0.484549,0.092509,0.460684,0.068975,0.53869,0.062612,0.46806,0.048168
4,0.277759,0.681062,0.219641,0.083075,0.32727,0.110761,0.325514,0.113536,0.357017,0.180228,...,0.383289,0.200462,0.410225,0.156424,0.403875,0.1203,0.424652,0.130173,0.233047,0.094836


In [7]:
# Convert Labels to one-Hot codes
cols = Y['label'].unique()
Y_encode = np.array(Y)
for i, item in enumerate(cols):
    ind = np.where(Y_encode == item)[0]
    Y_encode[ind] = i
Y_encode = pd.DataFrame(to_categorical(Y_encode))
Y_encode.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Split the Data into train and test
x_train, x_test, y_train, y_test = train_test_split(final_df, Y_encode, train_size = 0.9)
print('Length of x_train is : {}'.format(len(x_train)))
print('Length of y_train is : {}'.format(len(y_train)))
print('Length of x_test is : {}'.format(len(x_test)))
print('Length of y_test is : {}'.format(len(y_test)))

Length of x_train is : 900
Length of y_train is : 900
Length of x_test is : 100
Length of y_test is : 100


In [9]:
x_train_rnn = np.array(x_train)
x_train_rnn = x_train_rnn.reshape(len(x_train_rnn), 1, x_train_rnn.shape[1])
x_train_rnn.shape

(900, 1, 57)

#### Fitting a RNN model

In [37]:
model_rnn = keras.Sequential()
model_rnn.add(layers.LSTM(512, input_shape=(x_train_rnn.shape[1:])))
model_rnn.add(layers.Dense(10, activation="softmax"))
model_rnn.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 512)               1167360   
_________________________________________________________________
dense_11 (Dense)             (None, 10)                5130      
Total params: 1,172,490
Trainable params: 1,172,490
Non-trainable params: 0
_________________________________________________________________


In [38]:
model_rnn.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [39]:
model_rnn.fit(x_train_rnn, y_train, epochs=20, verbose = True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1cf75e9c400>

In [40]:
x_test_rnn = np.array(x_test)
x_test_rnn = x_test_rnn.reshape(len(x_test_rnn), 1, x_test_rnn.shape[1])
x_test_rnn.shape

(100, 1, 57)

In [41]:
y_preds_rnn = np.round(model_rnn.predict(x_test_rnn))
y_preds_rnn[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32)

In [42]:
print(classification_report(y_test, np.array(y_preds_rnn), target_names = Y['label'].unique()))

              precision    recall  f1-score   support

       blues       0.80      0.40      0.53        10
   classical       0.89      0.80      0.84        10
     country       0.40      0.25      0.31         8
       disco       0.50      0.08      0.14        12
      hiphop       0.88      0.47      0.61        15
        jazz       0.79      0.92      0.85        12
       metal       1.00      0.88      0.93         8
         pop       0.67      0.33      0.44         6
      reggae       0.54      0.70      0.61        10
        rock       1.00      0.33      0.50         9

   micro avg       0.75      0.52      0.62       100
   macro avg       0.75      0.52      0.58       100
weighted avg       0.75      0.52      0.58       100
 samples avg       0.52      0.52      0.52       100



  _warn_prf(average, modifier, msg_start, len(result))


#### Fitting regular model

In [43]:
# Build the regular model
model = keras.Sequential()
model.add(layers.Input(shape = (np.array(x_train).shape[1],)))
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 512)               29696     
_________________________________________________________________
dense_13 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_14 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_15 (Dense)             (None, 10)                5130      
Total params: 560,138
Trainable params: 560,138
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [45]:
model.fit(x_train, y_train, epochs=20, verbose = True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1cf7af42280>

In [46]:
y_preds = np.round(model.predict(x_test))
y_preds[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32)

In [47]:
print(classification_report(y_test, np.array(y_preds), target_names = Y['label'].unique()))

              precision    recall  f1-score   support

       blues       0.75      0.90      0.82        10
   classical       1.00      0.80      0.89        10
     country       0.54      0.88      0.67         8
       disco       0.78      0.58      0.67        12
      hiphop       0.80      0.53      0.64        15
        jazz       0.92      0.92      0.92        12
       metal       1.00      0.62      0.77         8
         pop       0.62      0.83      0.71         6
      reggae       0.78      0.70      0.74        10
        rock       1.00      0.22      0.36         9

   micro avg       0.78      0.69      0.73       100
   macro avg       0.82      0.70      0.72       100
weighted avg       0.83      0.69      0.72       100
 samples avg       0.69      0.69      0.69       100



  _warn_prf(average, modifier, msg_start, len(result))
