# Predictions Iteration #2

In [1]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet import preprocess_input
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.preprocessing import image
import soundfile as sf
import shutil
from glob import glob
import pandas as pd

In [2]:
# ImageNet-pretrained MobileNetV2 without its classification head; predict_chunk
# uses it to turn a 224x224 RGB spectrogram image into features.
base_model = MobileNetV2(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
)

# Saved classifier that predict_chunk applies to the features from base_model.
model = tf.keras.models.load_model(
    '../02_train/tl_model_with_noise.h5'
)

In [58]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet import preprocess_input
import soundfile as sf
import shutil

# Function to create spectrogram and return it as NumPy array
def create_spectrogram(chunk_audio, sr):
    """Compute the log-scaled mel spectrogram of one audio chunk.

    Args:
        chunk_audio: audio samples, handed to librosa as ``y``.
        sr: sampling rate of ``chunk_audio``.

    Returns:
        The mel power spectrogram converted to decibels, using the
        spectrogram's own maximum (``np.max``) as the reference.
    """
    mel_power = librosa.feature.melspectrogram(y=chunk_audio, sr=sr)
    return librosa.power_to_db(mel_power, ref=np.max)

# Function to preprocess spectrogram data and return as model input
def create_chunk_image_data(log_ms, sr, chunk_num):
    """Render a spectrogram as an image and return it as model input.

    The spectrogram is drawn with ``librosa.display.specshow`` on a figure whose
    single axes fills the whole canvas, saved to a temporary PNG in the current
    working directory, re-loaded at 224x224, given a leading batch axis and
    passed through ``preprocess_input``.

    Args:
        log_ms: spectrogram to draw (e.g. the output of ``create_spectrogram``).
        sr: sampling rate, forwarded to ``specshow``.
        chunk_num: chunk index; only used to name the temporary PNG.

    Returns:
        The array returned by ``preprocess_input`` for the rendered image.
    """
    img_file = f'temp_spec_{chunk_num}.png'

    try:
        fig = plt.figure()
        try:
            ax = fig.add_subplot(1, 1, 1)
            fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
            # Draw on this figure's axes explicitly instead of relying on
            # pyplot's "current axes" state.
            librosa.display.specshow(log_ms, sr=sr, ax=ax)
            fig.savefig(img_file)
        finally:
            # Always release the figure, even if rendering or saving failed;
            # this function is called once per window.
            plt.close(fig)

        x = image.load_img(img_file, target_size=(224, 224))
        x = image.img_to_array(x)
        x = np.expand_dims(x, axis=0)  # add the batch axis
        x = preprocess_input(x)
    finally:
        # Never leave the temporary PNG behind, including on errors.
        if os.path.exists(img_file):
            os.remove(img_file)

    return x

# Predict on each chunk/window
def predict_chunk(x):
    """Run the two-stage prediction for one preprocessed spectrogram image.

    Args:
        x: model-ready image batch, e.g. the output of ``create_chunk_image_data``.

    Returns:
        Whatever ``model.predict`` returns for the features that ``base_model``
        extracted from ``x``.
    """
    # Stage 1: feature extraction with the MobileNetV2 base model.
    features = base_model.predict(x, verbose=0)

    # Stage 2: classify the call type from those features.
    return model.predict(features, verbose=0)

# function to process WAV file with sliding window and overlap
def process_wav_with_overlap(wav_file, chunk_size=1.0, overlap=0):
    """Slide a fixed-size window over a recording and classify every window.

    The file is loaded with ``sr=44100``. Each full window is turned into a
    spectrogram image and classified; a trailing partial window is dropped.

    Args:
        wav_file: path to the WAV file. The recording ID is the file name up
            to its first ``.``.
        chunk_size: window length in seconds.
        overlap: fraction of a window shared with the next one (``0`` means
            back-to-back windows). Must be smaller than 1.

    Returns:
        pandas.DataFrame with one row per window and the columns
        ``unique_id``, ``laugh``, ``drum``, ``pik``, ``noise``. It has zero
        rows when the recording is shorter than one window.

    Raises:
        ValueError: if ``chunk_size`` is not positive (or shorter than one
            sample) or if ``overlap`` is 1 or more. Those values used to give a
            hop of zero or less, so the window loop never terminated.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    if overlap >= 1:
        raise ValueError(f'overlap must be smaller than 1, got {overlap}')

    # Extract the recording's unique id (basename handles the platform's path separator)
    wav_id = os.path.basename(wav_file).split('.')[0]
    print(f'Recording ID: {wav_id} \n')

    # Load in wav file
    y, sr = librosa.load(wav_file, sr=44100)
    print(f'Recording properties: len(y) = {len(y)}, sr (Hz) = {sr}, duration (s) = {len(y)/sr} \n')

    # Convert the window length from seconds to samples
    chunk_size_samples = int(chunk_size * sr)
    if chunk_size_samples < 1:
        raise ValueError(f'chunk_size of {chunk_size} s is shorter than one sample at {sr} Hz')

    # Hop between window starts. round() rather than int(): the float product
    # can land just below the intended integer (44100 * (1 - 0.9) evaluates to
    # 4409.999999999999), which truncation would turn into an off-by-one hop.
    # max(1, ...) keeps the loop advancing for very high overlaps.
    hop_size = max(1, int(round(chunk_size_samples * (1 - overlap))))

    print(f'Chunking parameters (# samples) : chunk size = {chunk_size_samples}, hop size = {hop_size} \n')

    # Initialize predictions dictionary
    class_predictions = {
        'unique_id' : [],
        'laugh' : [],
        'drum' : [],
        'pik' : [],
        'noise' : []
    }

    # Start offsets of every window that fits completely inside the recording
    window_starts = range(0, len(y) - chunk_size_samples + 1, hop_size)

    for chunk_num, start in enumerate(window_starts):

        # Extract the current chunk
        chunk = y[start:start + chunk_size_samples]

        # Create spectrogram for the chunk
        log_ms = create_spectrogram(chunk, sr)

        # Preprocess spectrogram data for model input
        x = create_chunk_image_data(log_ms, sr, chunk_num)

        # Make prediction for the chunk
        chunk_prediction = predict_chunk(x)

        # Indexing based on category encoded during training
        class_predictions['unique_id'].append(wav_id)
        class_predictions['laugh'].append(chunk_prediction[0][0])
        class_predictions['drum'].append(chunk_prediction[0][1])
        class_predictions['pik'].append(chunk_prediction[0][2])
        class_predictions['noise'].append(chunk_prediction[0][3])

    return pd.DataFrame(class_predictions)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [40]:
# Function to visualize predictions for each window.
def plot_predictions(pred_dict, fig_size = (15,5)):
    """Plot the per-window score of every class as a line chart.

    Args:
        pred_dict: mapping (dict or DataFrame) with the keys ``laugh``,
            ``drum``, ``pik`` and ``noise``, each holding one value per window.
        fig_size: ``(width, height)`` passed to ``plt.figure``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    num_chunks = len(pred_dict['laugh'])
    x_values = range(num_chunks)

    # Apply the theme only while this figure is built. plt.style.use() changed
    # the global rcParams for good, so every figure created afterwards -
    # including the spectrogram images rendered for the model - inherited the
    # theme. It was also called after plt.figure(), so the figure itself was
    # created under the old settings.
    with plt.style.context("fivethirtyeight"):
        plt.figure(figsize = fig_size)

        # One line per class, each with its own marker
        for class_name, marker in (('laugh', 'o'), ('drum', 's'), ('pik', '^'), ('noise', 'x')):
            plt.plot(x_values, pred_dict[class_name], label=class_name, marker=marker, linestyle='-')

        plt.xlabel('Window/Index')
        plt.ylabel('Probability')
        plt.title('Probability Time Series for Each Class')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

In [None]:
# predict on many recordings that have yet to be tested, default chunk size = 1s, with 0% overlap.
def predict_on_recordings(wav_file_list, chunk_size = 1, overlap = 0):
    """Predict on and plot every recording in a list.

    Args:
        wav_file_list: iterable of WAV file paths.
        chunk_size: window length in seconds, forwarded to
            ``process_wav_with_overlap``.
        overlap: overlap fraction between consecutive windows, forwarded to
            ``process_wav_with_overlap``.

    Returns:
        pandas.DataFrame with the per-window predictions of all recordings
        stacked in input order (the ``unique_id`` column identifies the
        recording). An empty DataFrame is returned for an empty list.
    """
    per_recording = []

    for wav_file in wav_file_list:

        # predict on the recording using specified chunk_size and overlap parameters
        pred_iter = process_wav_with_overlap(wav_file, chunk_size = chunk_size, overlap=overlap) # returns a DataFrame of prediction values

        # plot the prediction output
        plot_predictions(pred_iter)

        # keep the predictions instead of discarding them after plotting
        per_recording.append(pred_iter)

    if not per_recording:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    return pd.concat(per_recording, ignore_index=True)
        

In [5]:
# NOTE(review): absolute, machine-specific path - point it at your own copy of the WAV files.
# sorted() because glob returns matches in arbitrary order, which made
# all_wave_files[0] depend on the filesystem.
all_wave_files = sorted(glob('/Users/calzada/birdsongs/wood/01_data/wav_files/*.wav'))

In [6]:
# Use the first recording in the list to try out the sliding-window prediction.
test = all_wave_files[0]

In [50]:
# 1 s windows, with 95% overlap between consecutive windows.
test_pred = process_wav_with_overlap(test, chunk_size=1, overlap=0.95)

Recording ID: 273072621 

Properties: len(y) = 543900, sr (Hz) = 44100, duration (s) = 12.333333333333334 

Chunking parameters: chunk size (# samples) = 44100, hop size = 2205 



In [51]:
# One row per window: recording id plus the score of each class.
test_pred

Unnamed: 0,unique_id,laugh,drum,pik,noise
0,273072621,0.000000e+00,0.0,0.000000e+00,1.000000e+00
1,273072621,0.000000e+00,0.0,0.000000e+00,1.000000e+00
2,273072621,0.000000e+00,0.0,0.000000e+00,1.000000e+00
3,273072621,0.000000e+00,0.0,0.000000e+00,1.000000e+00
4,273072621,0.000000e+00,0.0,0.000000e+00,1.000000e+00
...,...,...,...,...,...
222,273072621,9.798575e-24,0.0,2.354488e-07,9.999998e-01
223,273072621,0.000000e+00,0.0,1.000000e+00,0.000000e+00
224,273072621,0.000000e+00,0.0,1.000000e+00,0.000000e+00
225,273072621,1.608517e-27,0.0,3.842589e-06,9.999962e-01


## Boolean filter mask to find rows where `noise` is not the maximum value

In [60]:
# True for windows where `noise` is not the row-wise maximum, i.e. some other
# class scores strictly higher.
(
    test_pred[['laugh', 'drum', 'pik', 'noise']].max(axis=1)
    != test_pred['noise']
)

0      False
1      False
2      False
3      False
4      False
       ...  
222    False
223     True
224     True
225    False
226     True
Length: 227, dtype: bool

In [62]:
from scipy.stats import multinomial

In [61]:
# Keep only the windows where `noise` is not the highest-scoring class.
test_pred.loc[
    test_pred[['laugh', 'drum', 'pik', 'noise']].max(axis=1)
    != test_pred['noise']
]

Unnamed: 0,unique_id,laugh,drum,pik,noise
29,273072621,0.0,0.0,1.0,2.346629e-11
30,273072621,0.0,0.0,1.0,1.150084e-24
31,273072621,0.0,0.0,1.0,0.0
32,273072621,0.0,0.0,1.0,0.0
33,273072621,0.0,0.0,1.0,0.0
34,273072621,0.0,0.0,1.0,0.0
35,273072621,0.0,0.0,1.0,0.0
36,273072621,0.0,0.0,1.0,0.0
37,273072621,0.0,0.0,1.0,0.0
38,273072621,0.0,0.0,1.0,0.0
