In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import matplotlib.pyplot as plt
from keras import models
import pandas as pd
import pywt
import numpy as np
from scipy.signal import hilbert, sosfilt, sosfreqz, butter
import librosa
import tensorflow as tf

In [3]:
'''
FEATURES
'''

def compute_spectrogram(audio, sr, n_fft=2048, hop_length=512):
    S = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length))
    spectrogram = librosa.amplitude_to_db(S, ref=np.max)
    return spectrogram


def compute_wavelet_transform(audio, wavelet='db4', level=5):
    coeffs = pywt.wavedec(audio, wavelet, level=level)
    coeffs_flat = np.concatenate([c.ravel
     () for c in coeffs])
    return coeffs_flat

def compute_hilbert_envelope(audio):
    analytic_signal = hilbert(audio)
    amplitude_envelope = np.abs(analytic_signal)
    return amplitude_envelope

def compute_homomorphic_envelope(audio, sr, low_freq=5, high_freq=40):
    # Logarithmic transformation
    # Adding a small value to avoid taking log of zero
    log_audio = np.log(audio + 1e-6)

    # Design a bandpass filter in the range [low_freq, high_freq]
    sos = butter(N=4, Wn=[low_freq/sr*2, high_freq/sr*2], btype='band', output='sos')

    # Apply the bandpass filter
    filtered_signal = sosfilt(sos, log_audio)

    # Exponential to get the envelope
    homomorphic_envelope = np.exp(filtered_signal) - 1

    return homomorphic_envelope

def create_feature_dictionary(y, sr):
    features = {}
    #features['spectrogram'] = compute_spectrogram(y, sr)
    features['wavelet_transform'] = compute_wavelet_transform(y)
    features['hilbert_envelope'] = compute_hilbert_envelope(y)
    features['homomorphic_envelope'] = compute_homomorphic_envelope(np.abs(y), sr)
    return features

In [4]:
def plot_features(feature,feature_array):

  #audio.shape = (44100,)
  #feature_array.shape = (44100,)

  plt.figure(figsize=(10, 3))

  if feature=="wavelet_transform":
    f = plt.plot(feature_array)
    plt.title('Wavelet Transform')
    plt.xlabel('Coefficient Index')
    plt.ylabel('Coefficient Value')
    plt.savefig("/content/wavelet_transform.png")
    plt.cla()
    plt.clf()
    plt.close('f')
    plt.close('all')

  elif feature=="hilbert_envelope":
    f = plt.plot(time, feature_array)
    plt.title('Hilbert Envelope')
    plt.xlabel('Time (s)')
    plt.ylabel('Envelope Amplitude')
    plt.savefig("/content/hilbert_envelope.png")
    plt.cla()
    plt.clf()
    plt.close('f')
    plt.close('all')

  elif feature=="homomorphic_envelope":
    f = plt.plot(time, feature_array)
    plt.title('Homomorphic Envelope')
    plt.xlabel('Time (s)')
    plt.ylabel('Envelope Amplitude')
    plt.savefig("/content/homomorphic_envelope.png")
    plt.cla()
    plt.clf()
    plt.close('f')
    plt.close('all')


  plt.cla()
  plt.clf()
  plt.close('all')

In [7]:
sample_rate = 44100
audio_path = "/content/drive/MyDrive/FSDKaggle2018.audio_train/00353774.wav"
y,sr = librosa.load(audio_path,sr=sample_rate)
y = y = y/np.max(np.abs(y))

time = np.linspace(0,len(y[:3*44100])/sr,num=len(y[:3*44100]))
f = plt.figure(figsize=(10, 3))
plt.plot(time, y[:3*44100])
plt.title('Audio Waveform')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.savefig("00353774wav.png")
plt.close(f)

features = create_feature_dictionary(y[:3*44100],sr)
features

{'wavelet_transform': array([-0.07051754, -0.0712806 , -0.07100114, ..., -0.00042168,
         0.00019684,  0.00018053], dtype=float32),
 'hilbert_envelope': array([0.11893722, 0.03894236, 0.04299807, ..., 0.1183101 , 0.12455299,
        0.1677107 ], dtype=float32),
 'homomorphic_envelope': array([-1.65338965e-10, -1.48841561e-09, -6.77770451e-09, ...,
        -2.06149906e-01, -2.06336104e-01, -2.06521777e-01])}

In [8]:
for feature in features.keys():
  plot_features(feature=feature,feature_array=features[feature])

In [12]:
def get_input(y):
  audio_features = []
  input = []

  audio_features.append(y[:3*44100])

  for feature in features.keys():
    feature_array = features[feature]
    feature_array = feature_array[:3*44100]
    audio_features.append(feature_array)

  audio_features = np.stack(audio_features, axis=1)
  audio_features = audio_features.T

  input.append(audio_features)

  return(input)

In [13]:
model = models.load_model('/content/Conv1DWaveFeatures.h5')
model.layers

[<keras.src.layers.convolutional.conv1d.Conv1D at 0x7e5448535b10>,
 <keras.src.layers.pooling.max_pooling1d.MaxPooling1D at 0x7e5448536a40>,
 <keras.src.layers.convolutional.conv1d.Conv1D at 0x7e5448535120>,
 <keras.src.layers.pooling.max_pooling1d.MaxPooling1D at 0x7e5448536dd0>,
 <keras.src.layers.convolutional.conv1d.Conv1D at 0x7e5448536860>,
 <keras.src.layers.pooling.max_pooling1d.MaxPooling1D at 0x7e54485358d0>,
 <keras.src.layers.convolutional.conv1d.Conv1D at 0x7e5448535840>,
 <keras.src.layers.pooling.max_pooling1d.MaxPooling1D at 0x7e5448536bc0>,
 <keras.src.layers.reshaping.flatten.Flatten at 0x7e5447c990f0>,
 <keras.src.layers.core.dense.Dense at 0x7e5447c98fa0>,
 <keras.src.layers.regularization.dropout.Dropout at 0x7e5447c985e0>,
 <keras.src.layers.core.dense.Dense at 0x7e5447c983d0>]

In [14]:
input = get_input(y)
input = np.array(input).reshape((1, (3*44100), -1))

for layer_idx in range(len(model.layers)):
  if isinstance(model.layers[layer_idx], (tf.keras.layers.Conv1D)):
    layer_output = tf.keras.models.Model(inputs=model.input, outputs=model.layers[layer_idx].output)
    output = layer_output.predict(input)
    print(output.shape)

    for kernel in range(output.shape[2]):
      f = plt.figure()
      plt.title(f'Conv1DLayer{layer_idx} Kernel {kernel}')
      plt.plot(output[0, :, kernel])
      plt.savefig(f'Conv1DLayer{layer_idx} Kernel {kernel}.png')
      plt.close(f)

(1, 132298, 16)
(1, 66147, 32)
(1, 33071, 64)
(1, 16533, 128)


In [15]:
import os
import zipfile

png_files = [f for f in os.listdir("/content/") if f.endswith('.png')]

zip_file_name = 'plots.zip'
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for file in png_files:
        file_path = os.path.join("/content/", file)
        zipf.write(file_path, arcname=file)

from google.colab import files
files.download(zip_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
test_labels = "/content/drive/MyDrive/test_post_competition_scoring_clips.csv"
test_labels_df = pd.read_csv(test_labels)
test_labels_df.head(10)

Unnamed: 0,fname,label,usage,freesound_id,license
0,00326aa9.wav,Oboe,Private,355125,Attribution
1,0038a046.wav,Bass_drum,Private,90621,Creative Commons 0
2,007759c4.wav,Saxophone,Private,13406,Creative Commons 0
3,008afd93.wav,Saxophone,Private,358962,Attribution
4,00ae03f6.wav,Chime,Private,78203,Attribution
5,00eac343.wav,Electric_piano,Public,371494,Creative Commons 0
6,010a0b3a.wav,Shatter,Private,368342,Attribution
7,01a5a2a3.wav,Bark,Private,30344,Attribution
8,01bb344f.wav,Acoustic_guitar,Private,128810,Attribution
9,02107093.wav,Electric_piano,Private,65660,Attribution


In [17]:
from sklearn.preprocessing import LabelEncoder
train_labels_dir =  "/content/drive/MyDrive/FSDKaggle2018.meta/train_post_competition.csv"

train_labels_df = pd.read_csv(train_labels_dir)

label_encoder = LabelEncoder()
train_labels_df['label_encoded'] = label_encoder.fit_transform(train_labels_df['label'])

train_labels_df.head(10)

Unnamed: 0,fname,label,manually_verified,freesound_id,license,label_encoded
0,00044347.wav,Hi-hat,0,28739,Attribution,23
1,001ca53d.wav,Saxophone,1,358827,Attribution,30
2,002d256b.wav,Trumpet,0,10897,Creative Commons 0,38
3,0033e230.wav,Glockenspiel,1,325017,Attribution,19
4,00353774.wav,Cello,1,195688,Attribution,6
5,003b91e8.wav,Cello,0,77944,Attribution,6
6,003da8e5.wav,Knock,1,164564,Creative Commons 0,25
7,0048fd00.wav,Gunshot_or_gunfire,1,274119,Creative Commons 0,21
8,004ad66f.wav,Clarinet,0,248370,Attribution,8
9,0063ab88.wav,Computer_keyboard,0,210304,Creative Commons 0,9


In [18]:
label_encodings = train_labels_df.drop_duplicates(['label']).set_index('label')['label_encoded'].to_dict()
test_labels_df['label_encoded'] = test_labels_df['label'].map(label_encodings)

test_labels_df.head(10)

Unnamed: 0,fname,label,usage,freesound_id,license,label_encoded
0,00326aa9.wav,Oboe,Private,355125,Attribution,29
1,0038a046.wav,Bass_drum,Private,90621,Creative Commons 0,3
2,007759c4.wav,Saxophone,Private,13406,Creative Commons 0,30
3,008afd93.wav,Saxophone,Private,358962,Attribution,30
4,00ae03f6.wav,Chime,Private,78203,Attribution,7
5,00eac343.wav,Electric_piano,Public,371494,Creative Commons 0,14
6,010a0b3a.wav,Shatter,Private,368342,Attribution,32
7,01a5a2a3.wav,Bark,Private,30344,Attribution,2
8,01bb344f.wav,Acoustic_guitar,Private,128810,Attribution,0
9,02107093.wav,Electric_piano,Private,65660,Attribution,14


In [38]:
test_data_dir = "/content/drive/MyDrive/FSDKaggle2018.audio_test"
Y_test = []
X_test = []
batch_size = 0
for index,row in test_labels_df.iterrows():
    fname =  row['fname']
    label = row['label_encoded']

    try:
      audio_path = test_data_dir+"/"+fname
      y,sr = librosa.load(audio_path,sr=sample_rate)

      if(len(y) >= (3*sr)):
        y = y/np.max(np.abs(y))
        y = y[:3*44100]

        X_test.append(get_input(y=y))
        Y_test.append(label)
        batch_size += 1
      else:
        continue
    except:
      pass

X_test = np.array(X_test).reshape((-1, (3*44100), 4))
Y_test = np.array(Y_test).reshape((batch_size,))
print(X_test.shape,Y_test.shape)

  y,sr = librosa.load(audio_path,sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


(362, 132300, 4) (362,)


In [39]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

Y_pred = model.predict(X_test)





In [41]:
Y_pred =  tf.argmax(Y_pred, axis=1)
Y_pred.shape

TensorShape([362])

In [43]:
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

np.set_printoptions(threshold=np.inf)

# Calculate confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print('Confusion Matrix:')
print(conf_matrix)

Precision: 0.058115616126427704
Recall: 0.11049723756906077
F1 Score: 0.06758721637861748
Confusion Matrix:
[[ 0  9  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  1  0  0  0  0  0  0  3  0  0]
 [ 0  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  4  0  0]
 [ 0  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  2  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  1  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0 11  0  0  0  0  0  0  0  0  0]
 [ 0  6  0  0  0  0  0  0  1  0  0  0 

  _warn_prf(average, modifier, msg_start, len(result))
