# Importando bibliotecas

In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from scipy.io import wavfile

In [2]:
# Mount Google Drive so the dataset pickles read by the following cells are
# reachable. The mount point is the relative path 'drive', while the loaders
# below use '/content/drive/...'; this presumably relies on the working
# directory being /content — confirm if the notebook is run elsewhere.
from google.colab import drive
drive.mount('drive')

Mounted at drive


# Rodando Classificador na Original

In [3]:
import pandas as pd
# Load the training split from a pickled DataFrame on the mounted Drive.
# SECURITY: unpickling can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
train = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/audio_ml/datasets/superb_ks_add_noise.pkl')
# Disabled: keep a random 60% of the rows and then shuffle them.
# train = train.sample(frac=0.6).sample(frac=1).copy()

In [4]:
# Load the held-out test split (pickled DataFrame; trusted sources only).
test = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/audio_ml/datasets/superb_ks_test.pkl')

In [5]:
# Load the validation split (pickled DataFrame; trusted sources only).
val = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/audio_ml/datasets/superb_ks_validation.pkl')

# Extrai Mel-Espectrograma

In [6]:
def get_mel_spectrogram(audio_dict, max_padding=0, n_fft=128, hop_length=64, n_mels=32):
    """Compute a normalized, dB-scaled mel spectrogram for one audio clip.

    Args:
        audio_dict: Mapping with an 'array' entry (the 1-D sample array) and a
            'sampling_rate' entry.
        max_padding: Target number of time frames. When positive and the
            spectrogram has fewer frames, zero columns are added on both sides
            (the odd extra column goes to the right). 0 disables padding.
        n_fft: Accepted for interface compatibility but NOT forwarded to
            librosa (see the note in the body).
        hop_length: Accepted for interface compatibility but NOT forwarded to
            librosa (see the note in the body).
        n_mels: Number of mel bands, i.e. rows of the returned matrix.

    Returns:
        2-D array with n_mels rows and one column per time frame, normalized
        with librosa.util.normalize and optionally zero-padded along the
        time axis.
    """
    # Keep at most the first 16000 samples; slicing is a no-op on shorter clips.
    y = audio_dict['array'][:16000]
    sr = audio_dict['sampling_rate']

    # Normalize audio data between -1 and 1
    normalized_y = librosa.util.normalize(y)

    # Generate mel scaled filterbanks. The signal is passed by keyword because
    # newer librosa releases no longer accept it positionally.
    # NOTE: n_fft / hop_length are deliberately not forwarded. Doing so would
    # change the number of frames produced, and the rest of the notebook
    # reshapes the features to a fixed 32x32 input.
    mel = librosa.feature.melspectrogram(y=normalized_y, sr=sr, n_mels=n_mels)

    # Convert sound intensity to log amplitude.
    # NOTE(review): melspectrogram presumably returns a power spectrogram by
    # default, in which case power_to_db would be the matching conversion —
    # confirm before changing, since it alters the feature values.
    mel_db = librosa.amplitude_to_db(abs(mel))

    # Normalize between -1 and 1
    normalized_mel = librosa.util.normalize(mel_db)

    # Pad only when padding was requested AND the spectrogram is too short.
    # (The previous bitwise `&` bound tighter than the comparisons, so the
    # length check was lost and wide inputs reached np.pad with negative widths.)
    n_frames = normalized_mel.shape[1]
    if max_padding > 0 and n_frames < max_padding:
        missing = max_padding - n_frames
        pad_left = missing // 2
        pad_right = missing - pad_left
        normalized_mel = np.pad(
            normalized_mel,
            pad_width=((0, 0), (pad_left, pad_right)),
            mode='constant',
        )
    return normalized_mel

In [7]:
from tqdm.auto import tqdm

# Register progress_apply on pandas objects so long transforms show a bar.
tqdm.pandas()

# One mel spectrogram per training clip, using the extractor's default settings.
train['mel_spec'] = train['audio'].progress_apply(get_mel_spectrogram)

  0%|          | 0/56210 [00:00<?, ?it/s]

In [8]:
train.head()

Unnamed: 0,file,audio,label,mel_spec
45360,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,2,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."
17229,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,11,"[[-1.0, -1.0, -0.59177244, 0.18149213, 0.38271..."
41712,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,11,"[[0.22222887, 0.32569015, 0.35693356, 0.223634..."
2464,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,11,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."
21161,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,11,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."


In [9]:
# Same feature extraction for the test and validation clips.
test['mel_spec'] = test['audio'].progress_apply(get_mel_spectrogram)
val['mel_spec'] = val['audio'].progress_apply(get_mel_spectrogram)

  0%|          | 0/3081 [00:00<?, ?it/s]

  0%|          | 0/6798 [00:00<?, ?it/s]

In [10]:
# Number of time frames (second axis) of every training spectrogram.
train['mel_spec_frames'] = train['mel_spec'].progress_apply(lambda spec: spec.shape[1])

# Widest training spectrogram; used below as the common padded width.
max_frames = int(train['mel_spec_frames'].max())
max_frames

  0%|          | 0/56210 [00:00<?, ?it/s]

32

In [11]:
def add_padding(features, max_padding=174):
    """Zero-pad every 2-D feature matrix along its second axis.

    Args:
        features: Indexable collection of 2-D matrices (rows x frames).
        max_padding: Target width. Matrices narrower than this are padded
            with zeros on both sides, the odd extra column going to the
            right. Matrices that are already this wide or wider are passed
            through untouched.

    Returns:
        A new list holding one matrix per input, in the same order.
    """
    def _widen(matrix):
        # Width is taken from the first row, as all rows share one length.
        width = len(matrix[0])
        if width >= max_padding:
            return matrix
        missing = max_padding - width
        left = missing // 2
        right = missing - left
        return np.pad(matrix, pad_width=((0, 0), (left, right)), mode='constant')

    return [_widen(features[idx]) for idx in range(len(features))]

In [12]:
# Bring every split to the common width measured on the training set.
padded_train, padded_test, padded_val = (
    add_padding(split['mel_spec'].to_list(), max_frames)
    for split in (train, test, val)
)

In [13]:
# Stack the padded spectrograms of each split into one dense array ...
X_train, X_test, X_val = (
    np.array(batch) for batch in (padded_train, padded_test, padded_val)
)

# ... and pull out the matching raw label vectors.
y_train, y_test, y_val = (
    np.array(split['label'].values) for split in (train, test, val)
)

In [14]:
from keras import backend as keras_backend
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, LeakyReLU, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam 
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 
from keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix

In [15]:
# Learn the label -> index mapping from the training labels ONLY and reuse
# that single mapping for every split. Previously the encoder was fitted on
# the test labels and then refitted on the training labels, so the test
# targets could be encoded with a different mapping than train/val.
# A label that never occurs in training now raises in transform() instead of
# being silently mis-encoded.
le = LabelEncoder()
le.fit(y_train)

# Pin the one-hot width so all three target matrices have the same number of
# columns even when a split happens to miss the highest class index.
n_label_classes = len(le.classes_)
y_train_encoded = to_categorical(le.transform(y_train), num_classes=n_label_classes)
y_test_encoded = to_categorical(le.transform(y_test), num_classes=n_label_classes)
y_val_encoded = to_categorical(le.transform(y_val), num_classes=n_label_classes)

In [16]:
import gc

# Drop the large intermediates now that the model-ready arrays exist.
# val, padded_val and y_val are not listed here and therefore stay in memory.
del padded_train, padded_test, y_test, y_train, train, test

# Force a collection pass; the value shown as cell output is its return value.
gc.collect()

44

In [17]:
# Report the test/train feature array shapes before they are reshaped for the network
print("X test shape: {} \t X train shape: {}".format(X_test.shape, X_train.shape))

X test shape: (3081, 32, 32) 	 X train shape: (56210, 32, 32)


In [18]:
# Input geometry, read from the training array instead of being hard-coded so
# it follows any change to n_mels or to the padded frame count. Slicing
# [1:3] also keeps this cell re-runnable after the channel axis was added.
num_rows, num_columns = X_train.shape[1:3]
num_channels = 1

# Reshape to fit the network input (channel last)
X_train = X_train.reshape(X_train.shape[0], num_rows, num_columns, num_channels)
X_test = X_test.reshape(X_test.shape[0], num_rows, num_columns, num_channels)
X_val = X_val.reshape(X_val.shape[0], num_rows, num_columns, num_channels)

# Total number of labels to predict (equal to the network output nodes)
num_labels = y_train_encoded.shape[1]

# Model

In [19]:
def create_model():
    """Assemble the (uncompiled) convolutional classifier.

    Layout: two 32-filter conv units, 2x2 max pooling, two 64-filter conv
    units, global average pooling, softmax output. Each conv unit is a 3x3
    convolution followed by LeakyReLU(0.1) and batch normalization.

    Reads num_rows, num_columns, num_channels and num_labels from the
    notebook's global scope.

    Returns:
        The Sequential model, not yet compiled.
    """
    model = Sequential()

    def add_conv_unit(filters, **conv_kwargs):
        # 3x3 convolution -> LeakyReLU -> batch normalization
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), **conv_kwargs))
        model.add(LeakyReLU(alpha=0.1))
        model.add(BatchNormalization())

    # First stage; only the very first layer declares the input shape.
    add_conv_unit(32, input_shape=(num_rows, num_columns, num_channels))
    add_conv_unit(32)

    # Halve the spatial resolution between the two stages.
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Second stage with twice as many filters.
    add_conv_unit(64)
    add_conv_unit(64)

    # Collapse each h x w feature map to its mean value.
    model.add(GlobalAveragePooling2D())

    # One softmax unit per label.
    model.add(Dense(num_labels, activation='softmax'))

    return model


# Instantiate the network; it still has to be compiled before training.
model = create_model()

In [20]:
# Configure training: Adam optimizer, accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',  # multi-class loss over one-hot targets (num_labels outputs)
    metrics=['accuracy'],  
    optimizer='adam')

# Display model architecture summary 
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 30, 30, 32)        320       
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 30, 30, 32)        0         
                                                                 
 batch_normalization (BatchN  (None, 30, 30, 32)       128       
 ormalization)                                                   
                                                                 
 conv2d_1 (Conv2D)           (None, 28, 28, 32)        9248      
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 28, 28, 32)        0         
                                                                 
 batch_normalization_1 (Batc  (None, 28, 28, 32)       128       
 hNormalization)                                        

In [21]:
# Train for 5 epochs with mini-batches of 8 clips, passing the validation
# split so Keras can report validation metrics; the returned History object
# is kept in `history`.
# NOTE(review): no random seed is set in the visible cells, so results are
# presumably not reproducible run to run — confirm.
history = model.fit(X_train, 
                    y_train_encoded, 
                    batch_size=8, 
                    epochs=5, 
                    validation_data=(X_val,y_val_encoded),
                    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Class-probability matrix for the test set (one row per clip).
y_probs = model.predict(X_test, verbose=0)

# Reduce probabilities and one-hot targets to integer class indices.
# Despite its name, yhat_probs holds the predicted class index per clip.
yhat_probs = y_probs.argmax(axis=1)
y_trues = y_test_encoded.argmax(axis=1)

In [23]:
from sklearn.metrics import accuracy_score
# Fraction of test clips whose predicted class index equals the true one.
accuracy_score(y_trues, yhat_probs)

0.7935735150925024

In [24]:
from sklearn.metrics import f1_score
# Macro-averaged F1 on the test set: per-class F1 scores averaged with equal
# weight per class.
f1_score(y_trues, yhat_probs, average='macro')

0.7700725664210948