<a href="https://colab.research.google.com/github/markcastorm/Audio_Classification_Deep-Learning/blob/main/Audio_Classification_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Audio Classification using Deep Learning**

Installing and importing dependencies

In [None]:
# Install the pinned TensorFlow builds plus TensorFlow I/O and matplotlib into the notebook runtime.
# NOTE(review): tensorflow-io is not pinned; confirm the version pip resolves is compatible with tensorflow 2.4.1.
!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 tensorflow-io matplotlib

#### Let's Load Dependencies

In [None]:
import os

import tensorflow as tf
import tensorflow_io as tfio
from matplotlib import pyplot as plt

#### Let's Build a Loading Function

First we define paths to our files

In [None]:
# Paths to two example clips, one from each class folder.
CAPUCHIN_FILE = os.path.join(
    'data',
    'Parsed_Capuchinbird_Clips',
    'XC3776-3.wav',
)
NOT_CAPUCHIN_FILE = os.path.join(
    'data',
    'Parsed_Not_Capuchinbird_Clips',
    'afternoon-birds-song-in-forest-0.wav',
)

Next, we build a data loading function

In [None]:
def load_wav_16k_mono(filename):
    """Load a WAV file as a single-channel waveform resampled to 16 kHz.

    Args:
        filename: Path to the WAV file to read.

    Returns:
        The decoded waveform with the channel axis removed, resampled to
        16000 Hz.
    """
    # Read the encoded WAV bytes from disk.
    file_contents = tf.io.read_file(filename)
    # Decode to a (samples, channels) tensor, requesting a single channel.
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # Drop the trailing channel axis.
    wav = tf.squeeze(wav, axis=-1)
    # Cast the file's sample rate to int64 before handing it to the resampler.
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Fix: `rate_in` must be passed as a keyword (the `=` was missing).
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

Lets visualize the wave

In [None]:
# Load one example clip from each class with the WAV loader.
# Fix: the second call previously used a misspelt function name.
wave = load_wav_16k_mono(CAPUCHIN_FILE)
nwave = load_wav_16k_mono(NOT_CAPUCHIN_FILE)

In [None]:
# Overlay both example waveforms on the same axes.
for signal in (wave, nwave):
    plt.plot(signal)
plt.show()

### Now Let's Create a TensorFlow Dataset


First we define the paths to the Positive and Negative Data

In [None]:
# Directories holding the positive and the negative example clips.
POS = os.path.join(
    'data',
    'Parsed_Capuchinbird_Clips',
)
NEG = os.path.join(
    'data',
    'Parsed_Not_Capuchinbird_Clips',
)

Lets make the tensorflow Datasets

In [None]:
# Build the glob patterns with os.path.join so the separator matches the
# host OS; the previous hard-coded backslash only matched on Windows and
# was also an invalid string escape.
pos = tf.data.Dataset.list_files(os.path.join(POS, '*.wav'))
neg = tf.data.Dataset.list_files(os.path.join(NEG, '*.wav'))

Add labels and combine the positive and negative data samples

In [None]:
# Pair every positive file with a 1 label and every negative file with a 0
# label, then join the two labelled datasets into one.
pos_labels = tf.data.Dataset.from_tensor_slices(tf.ones(len(pos)))
neg_labels = tf.data.Dataset.from_tensor_slices(tf.zeros(len(neg)))
positives = tf.data.Dataset.zip((pos, pos_labels))
negatives = tf.data.Dataset.zip((neg, neg_labels))
data = positives.concatenate(negatives)


Now let's determine the average length of a Capuchin call

Lets calculate the wave cycle length

In [None]:
# Record the number of samples in every clip found in the positive folder.
capuchin_dir = os.path.join('data', 'Parsed_Capuchinbird_Clips')
lengths = [
    len(load_wav_16k_mono(os.path.join(capuchin_dir, clip_name)))
    for clip_name in os.listdir(capuchin_dir)
]

Here we are calculating Mean, Min and Max

In [None]:
 tf.math.reduce_mean(lengths)
tf.math.reduce_min(lengths)
tf.math.reduce_max(lengths)

### Now let's build a processing function to convert to a spectrogram


First off lets start by building the Preprocessing Function

In [None]:
def preprocess(file_path, label):
    """Turn a WAV file path into a fixed-length magnitude spectrogram.

    The clip is cut to at most 48000 samples and zero-padded at the front
    up to exactly 48000 samples before the short-time Fourier transform
    is taken.

    Args:
        file_path: Path of the WAV file to load.
        label: Label passed through unchanged.

    Returns:
        A ``(spectrogram, label)`` pair; the spectrogram carries an extra
        trailing axis added with ``expand_dims``.
    """
    clip = load_wav_16k_mono(file_path)[:48000]
    # Zeros are placed before the audio so every example has 48000 samples.
    pad_length = [48000] - tf.shape(clip)
    padded = tf.concat([tf.zeros(pad_length, dtype=tf.float32), clip], 0)
    stft = tf.signal.stft(padded, frame_length=320, frame_step=32)
    # Keep only the magnitude and append the trailing axis.
    magnitude = tf.expand_dims(tf.abs(stft), axis=2)
    return magnitude, label


Let's test out the function

In [None]:
# Pull one (file path, label) pair from a shuffled view of the positives.
shuffled_positives = positives.shuffle(buffer_size=10000)
filepath, label = shuffled_positives.as_numpy_iterator().next()


In [None]:
# Run the sampled file through the preprocessing function.
spectrogram, label = preprocess(filepath, label)

In [None]:
# Render the spectrogram as an image on a large figure.
plt.figure(figsize=(30, 20))
# Transpose and take the first slice so a 2-D array is passed to imshow.
spectrogram_image = tf.transpose(spectrogram)[0]
plt.imshow(spectrogram_image)
plt.show()

### Creating the testing and training Partition

Lets create a Tensorflow Data Pipeline

In [None]:
# Input pipeline: preprocess each file, cache the results, shuffle,
# then group into batches of 16 and prefetch 8 batches.
data = data.map(preprocess)
data = data.cache()
# Fix: shuffle is a Dataset method; the bare `shuffle(...)` call was an
# undefined name.
data = data.shuffle(buffer_size=1000)
data = data.batch(16)
data = data.prefetch(8)

Lets split the data into train and test sets

In [None]:
# Fix: `train` and `test` were used but never defined. Split the batched
# dataset so the first 70% of batches train the model and the rest is held
# out for testing.
# NOTE(review): assumes `data` has a known cardinality so len() works, and
# that a 70/30 split is the intended ratio — confirm.
train_batches = int(len(data) * 0.7)
train = data.take(train_batches)
test = data.skip(train_batches)

# Peek at one training batch.
samples, labels = train.as_numpy_iterator().next()


In [None]:
# Show the shape of the sampled batch.
samples.shape

## Building the Deep Learning Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten

Now lets build the sequential Model

In [None]:
# Small CNN: two 3x3 convolution layers with 16 filters each, flattened
# into a 128-unit dense layer and a single sigmoid output unit.
# Fix: the imported class is `Sequential`; lowercase `sequential` is undefined.
model = Sequential()
model.add(Conv2D(16, (3, 3), activation='relu', input_shape=(1491, 257, 1)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Name the metrics explicitly so the training history always uses the keys
# 'recall' / 'precision' (and their 'val_' variants). Auto-generated metric
# names gain a numeric suffix when this cell is re-run, which would break
# the history lookups in the plotting cells.
model.compile(
    'Adam',
    loss='BinaryCrossentropy',
    metrics=[
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.Precision(name='precision'),
    ],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Fitting the Model

In [None]:
# Train for 4 epochs, evaluating on `test` after each one; `hist` keeps the
# per-epoch metric history used by the plots below.
hist = model.fit(train, epochs=4, validation_data=test)

In [None]:
# Training loss in red, validation loss in blue.
plt.title('Loss')
for history_key, line_style in (('loss', 'r'), ('val_loss', 'b')):
    plt.plot(hist.history[history_key], line_style)
plt.show()

In [None]:
# Training precision in red, validation precision in blue.
plt.title('Precision')
for history_key, line_style in (('precision', 'r'), ('val_precision', 'b')):
    plt.plot(hist.history[history_key], line_style)
plt.show()

In [None]:
# Training recall in red, validation recall in blue.
plt.title('Recall')
for history_key, line_style in (('recall', 'r'), ('val_recall', 'b')):
    plt.plot(hist.history[history_key], line_style)
plt.show()

Let's test out our model and make predictions on a single batch

In [None]:
# Take one batch of inputs and labels from the test split.
X_test, y_test = test.as_numpy_iterator().next()

In [None]:
# Run the model on the sampled test batch.
yhat = model.predict(X_test)

Now let's convert the predicted probabilities (sigmoid outputs) to classes

In [None]:
# Threshold each prediction at 0.5 to obtain a 0/1 class.
yhat = [1 if prediction > 0.5 else 0 for prediction in yhat]

Building a Forest Parsing Function

Lets load up the MP3s

In [None]:
def load_mp3_16k_mono(filename):
    """Load an audio file, combine its channels and resample to 16 kHz.

    Args:
        filename: Path of the audio file to open with tensorflow-io.

    Returns:
        The channel-combined waveform resampled to 16000 Hz.
    """
    # Fix: assignment (`=`) instead of the stray subtraction (`-`).
    res = tfio.audio.AudioIOTensor(filename)
    # Fix: method call uses `.`; the comma built a tuple and then called an
    # undefined name.
    tensor = res.to_tensor()
    # Sum over the channel axis and halve the result.
    # NOTE(review): dividing by 2 assumes two-channel input — confirm.
    tensor = tf.math.reduce_sum(tensor, axis=1) / 2
    sample_rate = tf.cast(res.rate, dtype=tf.int64)
    # Fix: the keyword is `rate_in=sample_rate`, not `rate_sample_rate`.
    wav = tfio.audio.resample(tensor, rate_in=sample_rate, rate_out=16000)
    return wav

In [None]:
# Path to the first forest recording, used to try out the MP3 loader.
mp3 = os.path.join('data', 'Forest Recordings', 'recording_00.mp3')

In [None]:
# Load the recording through the MP3 loader defined above.
wav = load_mp3_16k_mono(mp3)

In [None]:
# Cut the waveform into non-overlapping 48000-sample windows, one window per batch.
# NOTE(review): confirm `tf.keras.utils.timeseries_dataset_from_array` is available in the pinned TensorFlow 2.4.1.
audio_slices = tf.keras.utils.timeseries_dataset_from_array(wav, wav, sequence_length=48000, sequence_stride=48000, batch_size=1)

In [None]:
# Take the first (window, target) pair from the sliced dataset.
samples,index  = audio_slices.as_numpy_iterator().next()

Lets build a function to convert clips into windowed spectrograms

In [None]:
def preprocess_mp3(sample, index):
    """Convert one audio window into a magnitude spectrogram.

    Args:
        sample: Batched window; only its first element is used.
        index: Unused. Accepted so the function can be mapped over datasets
            that yield two-element tuples.

    Returns:
        The magnitude spectrogram with an extra trailing axis.
    """
    sample = sample[0]
    # Zero-pad at the front so the window is exactly 48000 samples long.
    zero_padding = tf.zeros([48000] - tf.shape(sample), dtype=tf.float32)
    wav = tf.concat([zero_padding, sample], 0)
    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spectrogram = tf.abs(spectrogram)
    # Fix: `spectrogam` was a misspelt (undefined) name.
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram

Let's do the conversion and make predictions

In [None]:
# Slice the recording into non-overlapping 48000-sample windows, convert
# each window to a spectrogram and batch the spectrograms for prediction.
# Fix: the window length was 16000 here, inconsistent with the 48000-sample
# windows used by the other windowing cells and by the training
# preprocessing.
audio_slices = tf.keras.utils.timeseries_dataset_from_array(wav, wav, sequence_length=48000, sequence_stride=48000, batch_size=1)
audio_slices = audio_slices.map(preprocess_mp3)
audio_slices = audio_slices.batch(64)

In [None]:
# Predict on every window, then threshold each prediction at 0.5 to get a 0/1 class.
yhat = model.predict(audio_slices)
yhat = [1 if prediction > 0.5 else 0 for prediction in yhat]

Now lets group the predictions

In [None]:
from itertools import groupby

# Collapse each run of identical consecutive predictions into one entry;
# summing the collapsed 0/1 values then counts the runs of 1s.
collapsed = (key for key, _ in groupby(yhat))
yhat = list(collapsed)
calls = tf.math.reduce_sum(yhat).numpy()

In [None]:
# Display the number of detected call groups.
calls

Making a loop to go through the recording and making predictions

In [None]:
# Run the model over every file in the forest recordings folder and keep
# the raw model outputs per file.
# Fix: the dict is now named `results`, matching the assignment inside the
# loop and the cells that read it afterwards (it was declared as `result`).
results = {}
forest_dir = os.path.join('data', 'Forest Recordings')
for file in os.listdir(forest_dir):
    FILEPATH = os.path.join(forest_dir, file)

    wav = load_mp3_16k_mono(FILEPATH)
    # Non-overlapping 48000-sample windows, converted to spectrograms.
    audio_slices = tf.keras.utils.timeseries_dataset_from_array(wav, wav, sequence_length=48000, sequence_stride=48000, batch_size=1)
    audio_slices = audio_slices.map(preprocess_mp3)
    audio_slices = audio_slices.batch(64)
    yhat = model.predict(audio_slices)
    results[file] = yhat

In [None]:
# Display the raw per-file model outputs.
results

Now Lets convert the predictions into Classes

In [None]:
# Convert each file's model outputs to 0/1 classes using a 0.99 threshold.
class_preds = {}
for file, logits in results.items():
    # Fix: the comprehension variable now matches the name being tested
    # (it was bound as `predictions` but compared as `prediction`).
    class_preds[file] = [1 if prediction > 0.99 else 0 for prediction in logits]
class_preds

Finally, let's group consecutive detections

In [None]:
# For each recording, collapse runs of identical consecutive classes into a
# single entry and sum them, which counts the runs of 1s.
postprocessed = {
    file: tf.math.reduce_sum([key for key, _ in groupby(scores)]).numpy()
    for file, scores in class_preds.items()
}
postprocessed


Export Results

In [None]:
import csv

# Write one row per recording with its call count to results.csv.
# newline='' lets the csv module control line endings itself.
with open('results.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    # Fix: header column was misspelt as 'caapuchin_calls'.
    writer.writerow(['recording', 'capuchin_calls'])
    for key, value in postprocessed.items():
        writer.writerow([key, value])