In [1]:
! pip install nlpaug==1.1.10

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug==1.1.10
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 4.9 MB/s 
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.10


# Importando bibliotecas

In [2]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from scipy.io import wavfile

In [4]:
from google.colab import drive

# Mount at the absolute path that the dataset paths below rely on
# ('/content/drive/...'); a relative mount point only matches them when the
# current working directory happens to be /content.
drive.mount('/content/drive')

Mounted at drive


# Rodando Classificador na Original

In [5]:
import pandas as pd

# Load the training split.
# NOTE: unpickling can execute arbitrary code - only read pickles from a trusted source.
train = pd.read_pickle(
    '/content/drive/My Drive/Colab Notebooks/audio_ml/datasets/superb_ic_original.pkl'
)

In [6]:
# Load the test split.
# NOTE: unpickling can execute arbitrary code - only read pickles from a trusted source.
test = pd.read_pickle(
    '/content/drive/My Drive/Colab Notebooks/audio_ml/datasets/superb_ic_test.pkl'
)

In [7]:
# Load the validation split.
# NOTE: unpickling can execute arbitrary code - only read pickles from a trusted source.
val = pd.read_pickle(
    '/content/drive/My Drive/Colab Notebooks/audio_ml/datasets/superb_ic_validation.pkl'
)

In [8]:
from tqdm.auto import tqdm
# Register tqdm with pandas so that `progress_apply` (used by the cells below)
# is available on Series/DataFrame objects.
tqdm.pandas()

# Extrai Mel Espectrograma

In [9]:
def get_mel_spectrogram(audio_dict, max_padding=0, n_fft=128, hop_length=64, n_mels=32, duracao=6):
    """Compute a normalized, dB-scaled mel spectrogram for one audio sample.

    Args:
        audio_dict: Mapping with an ``'array'`` entry (the waveform as a NumPy
            array) and a ``'sampling_rate'`` entry (samples per second).
        max_padding: When > 0 and the spectrogram has fewer frames than this
            value, zero-pad it symmetrically along the frame axis up to
            ``max_padding`` frames. Wider spectrograms are returned unchanged.
        n_fft: Accepted for interface compatibility but currently NOT forwarded
            to librosa (librosa's default is used).
        hop_length: Same as ``n_fft``: accepted but not forwarded. Forwarding
            either one would change the frame count of every spectrogram, so
            the existing behaviour is kept.
        n_mels: Number of mel bands (rows of the result).
        duracao: Maximum clip duration in seconds; longer waveforms are
            truncated to their first ``duracao`` seconds.

    Returns:
        2-D array with ``n_mels`` rows and one column per frame.
    """
    sr = audio_dict['sampling_rate']

    # Keep at most `duracao` seconds of audio. The limit is derived from the
    # sample's own sampling rate instead of a hard-coded 16 kHz; slicing is a
    # no-op for clips that are already short enough.
    max_samples = int(duracao * sr)
    y = audio_dict['array'][:max_samples]

    # Normalize audio data between -1 and 1
    normalized_y = librosa.util.normalize(y)

    # Generate mel scaled filterbanks. The waveform is passed by keyword because
    # recent librosa releases no longer accept it positionally.
    mel = librosa.feature.melspectrogram(y=normalized_y, sr=sr, n_mels=n_mels)

    # Convert sound intensity to log amplitude.
    # NOTE(review): melspectrogram returns a power spectrogram by default, so
    # power_to_db may have been intended; amplitude_to_db is kept so existing
    # features stay unchanged - confirm.
    mel_db = librosa.amplitude_to_db(np.abs(mel))

    # Normalize between -1 and 1
    normalized_mel = librosa.util.normalize(mel_db)

    # Pad only when padding was requested AND the spectrogram is too narrow.
    # (`and`, not `&`: the bitwise operator binds tighter than the comparisons,
    # which used to reduce the test to `max_padding > 0` and made np.pad fail
    # with negative widths on spectrograms wider than max_padding.)
    n_frames = normalized_mel.shape[1]
    if max_padding > 0 and n_frames < max_padding:
        missing = max_padding - n_frames
        pad_left = missing // 2
        pad_right = missing - pad_left
        normalized_mel = np.pad(
            normalized_mel,
            pad_width=((0, 0), (pad_left, pad_right)),
            mode='constant',
        )
    return normalized_mel

In [10]:


# Mel spectrogram for every training clip (clips are capped at 6 seconds).
train['mel_spec'] = train['audio'].progress_apply(
    lambda audio: get_mel_spectrogram(audio, duracao=6)
)

  0%|          | 0/2314 [00:00<?, ?it/s]

In [11]:
# Peek at the first rows to sanity-check the new column.
train.head(n=5)

Unnamed: 0,file,audio,label,mel_spec
9276,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,4,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."
14057,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,1,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."
2795,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,3,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."
13856,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,1,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."
21285,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,5,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1..."


In [12]:
# Same feature extraction for the held-out splits.
test['mel_spec'] = test['audio'].progress_apply(
    lambda audio: get_mel_spectrogram(audio, duracao=6)
)
val['mel_spec'] = val['audio'].progress_apply(
    lambda audio: get_mel_spectrogram(audio, duracao=6)
)

  0%|          | 0/3793 [00:00<?, ?it/s]

  0%|          | 0/3118 [00:00<?, ?it/s]

In [13]:
# Number of frames (second axis) of each training spectrogram, then summary stats.
train['mel_spec_frames'] = train['mel_spec'].progress_apply(
    lambda spec: spec.shape[1]
)
train['mel_spec_frames'].describe()

  0%|          | 0/2314 [00:00<?, ?it/s]

count    2314.000000
mean       71.660328
std        19.439009
min        27.000000
25%        59.000000
50%        70.000000
75%        82.000000
max       188.000000
Name: mel_spec_frames, dtype: float64

In [14]:
import nlpaug.augmenter.spectrogram as nas
import nlpaug.flow as naf
import random
import numpy as np

In [15]:
def specaug(mel_spec, naug):
  """Create `naug` randomly masked variants of one mel spectrogram.

  Every variant is produced by a freshly built nlpaug pipeline that applies a
  frequency mask followed by a time mask; the zone bounds of both masks are
  drawn at random from fixed candidate lists.

  Args:
    mel_spec: Spectrogram handed to the augmenters.
    naug: Number of augmented variants to generate.

  Returns:
    List with `naug` float32 NumPy arrays.
  """
  zone_starts = [0.05, 0.1, 0.15, 0.2, 0.25]
  zone_ends = [0.95, 0.9, 0.85, 0.8, 0.75]

  def draw_zone():
    # Lower bound is drawn first, then the upper bound.
    start = random.sample(zone_starts, 1)[0]
    end = random.sample(zone_ends, 1)[0]
    return (start, end)

  augmented = []
  for _ in range(naug):
    # New augmenters per variant so each one gets its own random zones.
    freq_mask = nas.FrequencyMaskingAug(zone=draw_zone(), factor=(4, 32))
    time_mask = nas.TimeMaskingAug(zone=draw_zone())
    pipeline = naf.Sequential([freq_mask, time_mask])

    masked = pipeline.augment(mel_spec)
    augmented.append(np.array(masked).astype(np.float32))
  return augmented

In [16]:
# Ten augmented variants per training spectrogram (stored as a list per row).
train['mel_spec_aug'] = train['mel_spec'].progress_apply(
    lambda spec: specaug(spec, 10)
)

  0%|          | 0/2314 [00:00<?, ?it/s]

In [17]:
# One row per augmented spectrogram; keep only the augmented feature and its
# label, and expose the feature under the name 'mel_spec' again.
train = (
    train
    .explode('mel_spec_aug')
    .reset_index(drop=True)[['mel_spec_aug', 'label']]
    .copy()
    .rename(columns={'mel_spec_aug': 'mel_spec'})
)

In [18]:
# Recompute the frame counts on the augmented set; the widest spectrogram
# defines the common width used for padding.
train['mel_spec_frames'] = train['mel_spec'].progress_apply(
    lambda spec: spec.shape[1]
)
max_frames = int(train['mel_spec_frames'].max())
max_frames

  0%|          | 0/23140 [00:00<?, ?it/s]

188

In [19]:
def add_padding(features, max_padding=174, truncate=False):
    """Zero-pad 2-D feature arrays along their second axis to a common width.

    Args:
        features: Iterable of 2-D arrays (rows x columns).
        max_padding: Target number of columns.
        truncate: When False (default) arrays wider than ``max_padding`` are
            returned unchanged, so the result may contain mixed widths and
            cannot be stacked into one array. When True such arrays are
            centre-cropped to ``max_padding`` columns, mirroring the centred
            padding, so every returned array has the same width.

    Returns:
        List with one array per input feature, in the same order.
    """
    padded = []

    for px in features:
        size = len(px[0])

        if size < max_padding:
            # Split the missing columns between both sides; the odd column,
            # if any, goes to the right.
            xDiff = max_padding - size
            xLeft = xDiff // 2
            xRight = xDiff - xLeft
            px = np.pad(px, pad_width=((0, 0), (xLeft, xRight)), mode='constant')
        elif truncate and size > max_padding:
            # Keep the central `max_padding` columns.
            start = (size - max_padding) // 2
            px = np.asarray(px)[:, start:start + max_padding]

        padded.append(px)

    return padded

In [20]:
# Pad every split to the width of the widest training spectrogram.
padded_train, padded_test, padded_val = (
    add_padding(split['mel_spec'].to_list(), max_frames)
    for split in (train, test, val)
)

In [21]:
# Stack the padded spectrograms into one array per split and pull out the labels.
X_train, y_train = np.array(padded_train), np.array(train['label'].values)
X_test, y_test = np.array(padded_test), np.array(test['label'].values)
X_val, y_val = np.array(padded_val), np.array(val['label'].values)

In [22]:
from keras import backend as keras_backend
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, LeakyReLU, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam 
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 
from keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix

In [23]:
# Fit the encoder on the training labels only, then one-hot encode every split.
le = LabelEncoder()
le.fit(y_train)

# Pin the one-hot width to the number of classes seen in training. Without
# num_classes, to_categorical infers the width from the largest label present
# in each split, so a split lacking the highest class would get fewer columns
# than the network has outputs.
n_classes = len(le.classes_)
y_train_encoded = to_categorical(le.transform(y_train), num_classes=n_classes)
y_test_encoded = to_categorical(le.transform(y_test), num_classes=n_classes)
y_val_encoded = to_categorical(le.transform(y_val), num_classes=n_classes)

In [24]:
import gc

# Drop the intermediates that were already converted to arrays / encoded
# labels, then ask the garbage collector to reclaim the memory.
del padded_train, padded_test
del y_test, y_train
del train, test

gc.collect()

66

In [25]:
# Report the shapes of the stacked feature arrays
print(
    "X test shape: {} \t X train shape: {}".format(X_test.shape, X_train.shape)
)

X test shape: (3793, 32, 188) 	 X train shape: (23140, 32, 188)


In [27]:
# How data should be structured: (samples, rows, columns, channels).
# Rows/columns are read from the training array instead of being hard-coded, so
# this cell keeps working when n_mels or the padded width changes. Reading
# axes 1 and 2 also keeps the cell safe to re-run after the reshape below.
num_rows, num_columns = X_train.shape[1], X_train.shape[2]
num_channels = 1

# Reshape to fit the network input (channel last). A split whose spectrograms
# do not match the training dimensions fails here instead of being silently
# reinterpreted.
X_train = X_train.reshape(X_train.shape[0], num_rows, num_columns, num_channels)
X_test = X_test.reshape(X_test.shape[0], num_rows, num_columns, num_channels)
X_val = X_val.reshape(X_val.shape[0], num_rows, num_columns, num_channels)

# Total number of labels to predict (equal to the network output nodes)
num_labels = y_train_encoded.shape[1]

# Model

In [28]:
def create_model():
    """Build the (uncompiled) convolutional classifier.

    Layout: two 32-filter conv blocks, 2x2 max pooling, two 64-filter conv
    blocks, global average pooling and a softmax output layer. Each conv block
    is a 3x3 convolution followed by LeakyReLU(alpha=0.1) and batch
    normalization.

    Reads the module-level names `num_rows`, `num_columns`, `num_channels`
    (input shape) and `num_labels` (number of output units).

    Returns:
        The assembled Sequential model.
    """
    model = Sequential()

    def add_conv_block(n_filters, **conv_kwargs):
        # 3x3 convolution -> LeakyReLU -> batch normalization
        model.add(Conv2D(filters=n_filters, kernel_size=(3, 3), **conv_kwargs))
        model.add(LeakyReLU(alpha=0.1))
        model.add(BatchNormalization())

    # First stage; the very first layer declares the input shape
    add_conv_block(32, input_shape=(num_rows, num_columns, num_channels))
    add_conv_block(32)

    # Max Pooling #1
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Second stage
    add_conv_block(64)
    add_conv_block(64)

    # Reduces each h x w feature map to a single number by averaging all its values
    model.add(GlobalAveragePooling2D())

    # Softmax output
    model.add(Dense(num_labels, activation='softmax'))

    return model


# Instantiate the network; it is compiled in a later cell.
model = create_model()

In [29]:
# Multi-class setup: one-hot targets with a softmax output, trained with Adam.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Display model architecture summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 30, 186, 32)       320       
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 30, 186, 32)       0         
                                                                 
 batch_normalization (BatchN  (None, 30, 186, 32)      128       
 ormalization)                                                   
                                                                 
 conv2d_1 (Conv2D)           (None, 28, 184, 32)       9248      
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 28, 184, 32)       0         
                                                                 
 batch_normalization_1 (Batc  (None, 28, 184, 32)      128       
 hNormalization)                                        

In [30]:
# Train for 5 epochs, monitoring the validation split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train_encoded,
    validation_data=(X_val, y_val_encoded),
    epochs=5,
    batch_size=8,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
# Class probabilities for every test sample
y_probs = model.predict(X_test, verbose=0)

# Collapse probabilities / one-hot targets to integer class indices
# (despite its name, `yhat_probs` holds predicted class indices).
yhat_probs = y_probs.argmax(axis=1)
y_trues = y_test_encoded.argmax(axis=1)

In [32]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted class matches the true class
accuracy_score(y_true=y_trues, y_pred=yhat_probs)

0.533350909570261

In [33]:
from sklearn.metrics import f1_score

# Macro-averaged F1: unweighted mean of the per-class F1 scores
f1_score(y_true=y_trues, y_pred=yhat_probs, average='macro')

0.5335738365077755