In [3]:
%load_ext autoreload
%autoreload 2

import librosa
import pandas as pd
import numpy as np
import os
import h5py
import tensorflow as tf
from tensorflow import keras
import tensorflow_ranking as tfr

from os import path
from config import *


2024-09-22 07:55:09.187376: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-22 07:55:09.187420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-22 07:55:09.321899: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-22 07:55:09.583264: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def get_frames(hdf5_dataset):
    recordings = np.array(hdf5_dataset)
    rec =  recordings[0]
    embeddings_frames = []
    for rec in recordings:
        embeddings_frames.extend([
            (rec, i) for i in range(hdf5_dataset[rec]['X'].shape[0])
        ])
        
    return embeddings_frames
    
    
hdf5_dataset = h5py.File(INTERMEDIATE / 'embeddings_20p.hdf5', 'r')
embeddings_frames = get_frames(hdf5_dataset)

### Labelled, unlabelled and test set init

In [5]:

np.random.seed(0)
np.random.shuffle(embeddings_frames) 

X = np.array([hdf5_dataset[rec]['X'][frame, :] for (rec, frame) in embeddings_frames])
Y = np.array([hdf5_dataset[rec]['Y'][:, frame] for (rec, frame) in embeddings_frames])
n_frames = len(X) 

In [6]:
n_frames = len(X) 
test_cut =  int(n_frames * 0.8)
label_cut = int(test_cut * 0.8)

# test data
other_X, other_Y = X[:test_cut],  Y[:test_cut]
test_X, test_Y = X[test_cut:],  Y[test_cut:]

# labelled and unlabelled data
unlabelled_X, unlabelled_Y = other_X[:label_cut], other_Y[:label_cut]
labelled_X, labelled_Y = other_X[label_cut:], other_Y[label_cut:]

u, t = len(unlabelled_X), len(labelled_Y),  
print(f'{u} unlabelled \n {t} labelled \n {(t/u)*100}% initial labelling budget')
print(f'\ntest XY lens: {len(test_X), len(test_Y)}')
print(f'unlabelled XY lens: {len(unlabelled_X), len(unlabelled_Y)}')
print(f'labelled XY lens: {len(labelled_X), len(labelled_Y)}')

282931 unlabelled 
 70733 labelled 
 25.000088360766405% initial labelling budget

test XY lens: (88416, 88416)
unlabelled XY lens: (282931, 282931)
labelled XY lens: (70733, 70733)


### Model & Training Utils

In [7]:
from keras.layers import Input, Dense, BatchNormalization
from functools import partial
from keras import metrics
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

def create_model() -> keras.Model:
    default_dense =  partial(Dense, activation='relu', 
                             kernel_initializer=keras.initializers.LecunNormal(seed=0)
                            ) 
    
    return keras.Sequential([
            Input(shape=(1024,)),
            default_dense(512), 
            # default_dense(64), 
            Dense(4, activation='sigmoid',)
    ])

def compile(model):
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',  
        metrics=[
            metrics.Recall(thresholds=0.5),
            metrics.Precision(thresholds=0.5),
            metrics.AUC(curve='pr', name='auc_pr'),
            metrics.AUC(curve='roc', name='auc_roc'), # not applicitory
            tfr.keras.metrics.get(key="map", name="metric/map"),
        ]
    )

def train(model, X, Y, model_dir, stopping_patience=5, 
          stopping_moniter='loss', **kwargs):
    model_dir.mkdir(exist_ok=True)

    # tensorboard
    log_dir = model_dir / "logs" / "fit"
    log_dir.mkdir(parents=True, exist_ok=True)
    tensorboard_callback = TensorBoard(log_dir=log_dir)

    # checkpoints
    checkpoint_path = model_dir / "training"
    Path(checkpoint_path).mkdir(exist_ok=True)  
    cp_callback = keras.callbacks.ModelCheckpoint(
        checkpoint_path / 'checkpoint.weights.h5', 
        save_weights_only=True,
        verbose=1, 
    )

    # early stopping
    es_checkpoint = EarlyStopping(
        monitor=stopping_moniter,
        patience=stopping_patience,
        restore_best_weights=True,
    )
    
    # fit
    history = model.fit(
        x=X,
        y=Y,
        verbose=2,
        callbacks=[tensorboard_callback, cp_callback, es_checkpoint],
        **kwargs
    )
    model.save(model_dir / 'model.keras')
    df = pd.DataFrame(history.history)
    df.to_csv(model_dir / "logs/history.csv")
    return history


### AL

In [10]:
def AL_simulation(unlabelled: tuple, intial_labelled: tuple, test: tuple,
                    model, name, sampling_query, pool_size, evaluate_metrics,
                    num_iterations=10, epochs=10):
    
    X_unlabelled, Y_unlabelled = unlabelled
    X_labelled, Y_labelled = intial_labelled
    X_test, Y_test = test
    
    histories = []
    iteration_metrics = {}
    dir = MODEL_DIR / name
    Path(dir).mkdir(exist_ok=True)
    
    for i in range(num_iterations):
        print(f" --- Iteration {i + 1} --- ")
        
        labelling_budget =  len(Y_labelled) / len(Y_unlabelled)
        id = f"{name}_i={i}_LB={int(labelling_budget)}"
        
        history = train(
            model, X_labelled, Y_labelled,
            epochs=epochs,
            batch_size=128,
            model_dir=MODEL_DIR / dir / id, 
        )
        histories.append(history)
        
        # get metrics from test set
        y_pred = model.predict(X_test)
        y_true = Y_test
        iteration_metrics[id] = evaluate_metrics(y_true, y_pred)
        
        # active learning qeary on unlabelled set
        y_pred = model.predict(X_unlabelled)
        indexes_pool = sampling_query(y_pred, unlabelled_X, pool_size)
        X_new = [X_unlabelled[i] for i in indexes_pool]
        Y_new = [Y_unlabelled[i] for i in indexes_pool]
        
        # add the newly 'annotated' samples to the labelled set        
        X_labelled = np.vstack((X_labelled, X_new))
        Y_labelled = np.vstack((Y_labelled, Y_new))
        
        # remove the new samples from the unlabelled set
        X_unlabelled = np.delete(X_unlabelled, indexes_pool, axis=0)
        Y_unlabelled = np.delete(Y_unlabelled, indexes_pool, axis=0)
        
        print("current iteration metrics: ", iteration_metrics)
        pd.DataFrame(history.history).to_csv(dir / id / 'hist.csv')
        np.save(dir / 'metrics.npy', iteration_metrics)
     
    return iteration_metrics, histories 

In [11]:
model = create_model()
compile(model)

def least_confidence_sampling(y_pred, unlabelled_X, pool_size):
    # Calculate least confidence (1 - max probability)
    least_confidence_scores = 1 - np.max(y_pred, axis=1)

    # Select the least confident instances
    uncertain_indices = np.argsort(least_confidence_scores)[:pool_size]
    return uncertain_indices

def precision_recall(y_true, y_pred):
    classwize = {}
    for i in range(4):
        m = keras.metrics.Precision(thresholds=0.5)
        m.update_state(y_true[:, i], y_pred[:, i])
        precision = m.result().numpy()
        
        m = keras.metrics.Recall(thresholds=0.5)
        m.update_state(y_true[:, i], y_pred[:, i])
        recall = m.result().numpy()
        
        classwize[i] = (precision, recall)
    
    return classwize    

iteration_metrics, histories = AL_simulation(
    
        unlabelled=(unlabelled_X, unlabelled_Y), 
        intial_labelled=(labelled_X, labelled_Y), 
        test=(test_X, test_Y),
        model=model, 
        name='least_confidence_sampling', 
        evaluate_metrics=precision_recall,
        pool_size= int(len(X) / 20), # iterations
        num_iterations=20,
        sampling_query=least_confidence_sampling
        
)



 --- Iteration 1 --- 


Epoch 1/10

Epoch 1: saving model to /home/ec2-user/acoustic-AL/models/least_confidence_sampling/least_confidence_sampling_i=0_LB=0/training/checkpoint.weights.h5
553/553 - 4s - loss: 0.0183 - recall_1: 0.0065 - precision_1: 0.0338 - auc_pr: 0.0213 - auc_roc: 0.8032 - metric/map: 0.0083 - 4s/epoch - 7ms/step
Epoch 2/10

Epoch 2: saving model to /home/ec2-user/acoustic-AL/models/least_confidence_sampling/least_confidence_sampling_i=0_LB=0/training/checkpoint.weights.h5
553/553 - 2s - loss: 0.0125 - recall_1: 0.0545 - precision_1: 0.7500 - auc_pr: 0.1776 - auc_roc: 0.8863 - metric/map: 0.0093 - 2s/epoch - 4ms/step
Epoch 3/10

Epoch 3: saving model to /home/ec2-user/acoustic-AL/models/least_confidence_sampling/least_confidence_sampling_i=0_LB=0/training/checkpoint.weights.h5
553/553 - 2s - loss: 0.0117 - recall_1: 0.0908 - precision_1: 0.7527 - auc_pr: 0.2300 - auc_roc: 0.9078 - metric/map: 0.0093 - 2s/epoch - 4ms/step
Epoch 4/10

Epoch 4: saving model to /home/ec2-user/acoustic-AL/models

2024-09-22 09:10:57.789232: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1158885376 exceeds 10% of free system memory.


current iteration metrics:  {'least_confidence_sampling_i=0_LB=0': {0: (0.7, 0.044444446), 1: (0.7266187, 0.541555), 2: (0.0, 0.0), 3: (0.72727275, 0.046511628)}}


AttributeError: 'numpy.ndarray' object has no attribute 'save'

In [2]:
y_pred = model(test_X)

m = keras.metrics.Precision(thresholds=0.5)
m.update_state(test_Y[:, 0], y_pred[:, 0])
m.result()
                                                                                                                                                                                                                                3: (0.71428573, 0.10471204)}

NameError: name 'model' is not defined

In [None]:
def get_predictions(model, batched_test_dataset):
    y_true, y_pred = [], []
    for batch_X, batch_y in batched_test_dataset, desc='loading/predicting test ds':
        predictions = model(batch_X, training=False)
        y_true.extend(batch_y.numpy())
        y_pred.extend(predictions.numpy()) 

    return np.array(y_true), np.array(y_pred)


def evaluate_metrics(y_true, y_pred):
    y_true_flat = y_true.reshape(-1, 4)
    y_pred_flat = y_pred.reshape(-1, 4)
    
    m = keras.metrics.AUC(curve='roc')
    m.update_state(y_true_flat[:, 0], y_pred_flat[:, 0])
    m.result()
    
    m = keras.metrics.AUC(curve='pr')
    m.update_state(y_true_flat[:, 0], y_pred_flat[:, 0])
    m.result()
    
    for i in range(4):
        print('\n class ', i)
        y_pred_binary = (y_pred_flat[:, i] >= threshold).astype(int)

        precision_metric = Precision()
        recall_metric = Recall()

        precision_metric.update_state(y_true_flat[:, i], y_pred_binary)
        recall_metric.update_state(y_true_flat[:, i], y_pred_binary)

        precision = precision_metric.result().numpy()
        recall = recall_metric.result().numpy()

        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        
        ...
            