Implementation of different methods for automatically detecting noisy input data

In [24]:
from ModelsArchitecture import BinarizedInputNetwork, ConvNet , BinarizedWeightNetwork
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [6]:
import os

# Target classes: the ten command words plus the two auxiliary classes.
# sorted() keeps the order deterministic across interpreter runs (iterating a
# set of strings depends on hash randomisation) and matches the
# `sorted(KEYWORDS)[label]` lookups used when results are printed.
KEYWORDS = sorted(set(['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go'] + ['unknown', 'silence']))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Location of the Speech Commands test set. The default can be overridden with
# the SPEECH_COMMANDS_TESTSET_PATH environment variable so the notebook is not
# tied to one machine. os.path.join(..., "") guarantees a trailing separator,
# which the dataset class needs because it concatenates a file name onto it.
TESTSET_PATH = os.path.join(
    os.environ.get('SPEECH_COMMANDS_TESTSET_PATH',
                   '/home/luciano/Downloads/speech_commands_test_set_v0.02/'),
    "")

# Audio / mel-spectrogram parameters.
SAMPLE_RATE = 16000
BATCH_SIZE = 16
N_MELS = 64
N_FFT = 512
POWER = 2.0
F_MIN = 50.0
F_MAX = 7500.0
HOP_LENGTH=round(SAMPLE_RATE*0.01)    # 10 ms hop, in samples
WIN_LENGTH=round(SAMPLE_RATE*0.025)   # 25 ms window, in samples
DURATION_SEC = 1.0

In [None]:
# Load the three trained networks and restore their weights from disk.
# map_location=device lets checkpoints saved on a GPU be restored on a
# CPU-only machine; weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state_dict needs, and avoids the
# torch.load FutureWarning.
input_shape = (1, 64, 101)  # Example input shape, adjust as needed
num_classes = len(KEYWORDS)  # Example number of classes, adjust as needed

full_precision_model = ConvNet().to(device)
full_precision_model.load_state_dict(
    torch.load("full_precision.pth", map_location=device, weights_only=True))

BinaryWeights_model = BinarizedWeightNetwork(input_shape, num_classes).to(device)
BinaryWeights_model.load_state_dict(
    torch.load("BinaryWeights.pth", map_location=device, weights_only=True))

full_binary_model = BinarizedInputNetwork(input_shape, num_classes).to(device)
full_binary_model.load_state_dict(
    torch.load("full_binary_model.pth", map_location=device, weights_only=True))

  full_precision_model.load_state_dict(torch.load("full_precision.pth"))
  BinaryWeights_model.load_state_dict(torch.load("BinaryWeights.pth"))
  full_binary_model.load_state_dict(torch.load("full_binary_model.pth"))


<All keys matched successfully>

In [10]:
import torch
import torchaudio
import numpy as np
from spectrogram import AudioDataset
import scipy.signal as signal

class AddNoiseOnTestset(AudioDataset):
    """
    Test-set dataset that optionally mixes a noise signal into every waveform
    and returns the result as a mel spectrogram in decibels.

    The list of audio files is read from ``testing_list.txt`` located under
    ``audio_path``. When ``noise_type`` is not None, one noise signal is
    created (or loaded from disk) in the constructor and the same signal is
    added to every item at the requested SNR.

    Args:
        audio_path: directory prefix of the test set; the file name
            ``testing_list.txt`` is concatenated directly onto it, so it must
            end with a path separator.
        n_mels, n_fft, hop_length, win_length, f_min, f_max, power:
            forwarded to ``torchaudio.transforms.MelSpectrogram``.
        sample_rate, duration_seconds, keywords, device: forwarded to the
            ``AudioDataset`` constructor (``sample_rate`` is also given to
            the mel transform).
        noise_type: ``None`` for no noise, or one of the names listed in
            ``self.noise_type_allowed``.
        snr_db: signal-to-noise ratio in dB passed to
            ``torchaudio.functional.add_noise``.

    Note: relies on ``self._device``, ``self._num_samples``,
    ``self._sample_rate`` and the ``_resample_if_necessary`` /
    ``_cut_if_necessary`` / ``_right_pad_if_necessary`` helpers, which are
    presumably provided by ``AudioDataset`` (not visible here).
    """
    def __init__(self, audio_path,
                 n_mels, n_fft,hop_length,
                 win_length,f_min,f_max,
                 power,sample_rate,duration_seconds,
                 keywords, device,noise_type=None,snr_db=3.0):

        TEST_LIST = "testing_list.txt"
        with open(audio_path+TEST_LIST, 'r') as f:
            audio_list = f.read().splitlines()

        super().__init__(audio_path, audio_list,sample_rate,duration_seconds,keywords,device)
        self.noise_type = noise_type
        self.snr_db = snr_db
        # white, pink, babble and classes from the UrbanSoundsDataset
        self.noise_type_allowed = [None,'white',
                                   'pink',
                                   'babble',
                                   'air_conditioner_background',
                                   'car_horn_background',
                                   'children_playing_background',
                                   'dog_bark_background',
                                   'drilling_background',
                                   'engine_idling_background',
                                   'gun_shot_background',
                                   'jackhammer_background',
                                   'siren_background',
                                   'street_music_background',
                                   'air_conditioner_foreground',
                                   'car_horn_foreground',
                                   'children_playing_foreground',
                                   'dog_bark_foreground',
                                   'drilling_foreground',
                                   'engine_idling_foreground',
                                   'gun_shot_foreground',
                                   'jackhammer_foreground',
                                   'siren_foreground',
                                   'street_music_foreground']

        assert self.noise_type in self.noise_type_allowed, "Noise type not allowed"  # TODO: implement more noise types
        if self.noise_type is not None:
            # Built once; the same noise signal is reused for every item.
            self.noise_signal = self.__load_noise_signal()

        transformation = torchaudio.transforms.MelSpectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            win_length=win_length,
            f_min=f_min,
            f_max=f_max,
            center=True,
            power = power,
            sample_rate=sample_rate,
        )
        self.mel_spectrogram = transformation.to(self._device)
        # Created once here instead of on every __getitem__ call.
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()


    def __load_noise_signal(self):
        """Return the noise waveform that corresponds to ``self.noise_type``."""
        if self.noise_type == 'white':
            return self.__white_noise()
        if self.noise_type == 'pink':
            return self.__pink_noise()
        if self.noise_type == 'babble':
            return self.__babble_noise()
        # Every remaining allowed name is an UrbanSound8K class.
        return self.__urban_sounds_noise()

    def __getitem__(self, index):
        """Return ``(mel spectrogram in dB, label)`` for the item at ``index``."""
        waveform, label = super().__getitem__(index)

        if self.noise_type is not None:
            # add_noise expects waveform and noise to have matching shapes,
            # so singleton dimensions are squeezed away on both first.
            noisy = torchaudio.functional.add_noise(waveform=torch.squeeze(waveform),
                                                    noise=torch.squeeze(self.noise_signal),
                                                    snr=torch.tensor(self.snr_db),
                                                    lengths=None)
            # Restore the leading dimension removed by squeeze.
            waveform = torch.unsqueeze(noisy, 0).to(self._device)

        mel_spectro = self.mel_spectrogram(waveform)
        # Convert to decibels
        mel_spectro = self.amplitude_to_db(mel_spectro)
        return mel_spectro, label


    def __white_noise(self):
        """Gaussian white noise with ``self._num_samples`` samples on ``self._device``."""
        return torch.randn(self._num_samples,device=self._device)

    def __pink_noise(self):
        """
        Build a noise signal by high-pass filtering white noise and then
        scaling its spectrum by 1/sqrt(f). Returned as float32 on
        ``self._device`` with ``self._num_samples`` samples.
        """
        # Generate white noise (on the CPU, because scipy does the filtering)
        white_noise = torch.randn(self._num_samples)

        # High-pass filter to remove DC component
        b, a = signal.butter(4, 0.05, 'highpass')
        filtered_noise = signal.lfilter(b, a, white_noise.numpy())

        fft = torch.fft.rfft(torch.from_numpy(filtered_noise))
        frequencies = torch.fft.rfftfreq(self._num_samples, d=1/self._sample_rate)
        pink_filter = 1 / torch.sqrt(torch.abs(frequencies + 1e-10))  # Avoid division by zero
        pink_filter[0] = 0  # Remove DC component
        pink_fft = fft * pink_filter

        # Inverse FFT to get the time-domain signal. n is given explicitly
        # because irfft otherwise returns 2 * (bins - 1) samples, which is
        # one sample short when the requested length is odd.
        pink_noise = torch.fft.irfft(pink_fft, n=self._num_samples)
        pink_noise = pink_noise.to(torch.float32)

        return pink_noise.to(self._device)

    def __prepare_noise_file(self, path):
        """Load a noise recording and run it through the resample, cut, pad and mix-down helpers."""
        noise,sr = torchaudio.load(path)

        noise = self._resample_if_necessary(noise, sr)
        noise = noise.to(self._device)
        noise = self._cut_if_necessary(noise)
        noise = self._right_pad_if_necessary(noise)
        # Multi-channel files are averaged to a single channel so the shape
        # still matches the speech waveform after squeezing.
        noise = self._mix_down_if_necessary(noise)
        return noise

    def __babble_noise(self):
        """Babble noise taken from the NOISEX-92 recording on disk."""
        return self.__prepare_noise_file("./noise/noisex-92/babble.wav")

    def __urban_sounds_noise(self): #TODO: read the wav file name from the csv file
        """Noise taken from the UrbanSound8K clip assigned to ``self.noise_type``."""
        wav_dict = {
                    'air_conditioner_background' : "177621-0-0-0.wav",
                    'car_horn_background' : "132073-1-0-0.wav",
                    'children_playing_background' : "135776-2-0-32.wav",
                    'dog_bark_background' : "102106-3-0-0.wav",
                    'drilling_background' : "17913-4-1-0.wav",
                    'engine_idling_background' : "46918-5-0-0.wav",
                    'gun_shot_background' : "135527-6-0-0.wav",
                    'jackhammer_background' : "180937-7-3-0.wav",
                    'siren_background' : "106905-8-0-0.wav",
                    'street_music_background' : "132016-9-0-0.wav",
                    'air_conditioner_foreground' : "127873-0-0-0.wav",
                    'car_horn_foreground' : "145577-1-0-0.wav",
                    'children_playing_foreground' : "105415-2-0-1.wav",
                    'dog_bark_foreground' : "101415-3-0-2.wav",
                    'drilling_foreground' : "103199-4-0-0.wav",
                    'engine_idling_foreground' : "103258-5-0-0.wav",
                    'gun_shot_foreground' : "102305-6-0-0.wav",
                    'jackhammer_foreground' : "103074-7-0-0.wav",
                    'siren_foreground' : "157867-8-0-0.wav",
                    'street_music_foreground' : "108041-9-0-11.wav"
                    }

        return self.__prepare_noise_file("./noise/UrbanSound8K/" + wav_dict[self.noise_type])

    def _mix_down_if_necessary(self, signal):
        """Average over the first dimension when it holds more than one channel."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

# 1) Confidence level of the output

In [9]:
# Materialise an entire dataset as two parallel Python lists.
def load_in_memory(data_set):
    """
    Iterate once over `data_set` and collect its items.

    Each item must unpack into a `(data, label)` pair.

    Returns:
        A tuple `(all_data, all_labels)` of two lists in iteration order.
    """
    samples = [(data, label) for data, label in data_set]
    all_data = [data for data, _ in samples]
    all_labels = [label for _, label in samples]
    return all_data, all_labels

In [87]:
# Clean evaluation set: noise_type is left at its default (None), so no noise
# is mixed into the waveforms.
noiseless_testset = AddNoiseOnTestset(
    audio_path=TESTSET_PATH,
    # mel-spectrogram settings
    n_mels=N_MELS, n_fft=N_FFT,
    hop_length=HOP_LENGTH, win_length=WIN_LENGTH,
    f_min=F_MIN, f_max=F_MAX, power=POWER,
    # waveform settings
    sample_rate=SAMPLE_RATE, duration_seconds=DURATION_SEC,
    keywords=KEYWORDS,
    device=device,
)

In [88]:
from sklearn.metrics import accuracy_score

def test_model_confidence(test_loader,model,device):
    """
    Run `model` over every batch of `test_loader` and record, per sample,
    the predicted class, the true label and the highest softmax probability.

    Args:
        test_loader: iterable yielding `(inputs, labels)` batches.
        model: network to evaluate; it is switched to evaluation mode.
        device: device the batches are moved to before the forward pass.

    Returns:
        `(accuracy, all_labels, all_preds, all_confidence)` where the labels
        and predictions are lists of NumPy scalars and the confidences are a
        list of 0-dim tensors left on `device`.
    """
    model.eval()  # Set the model to evaluation mode
    predictions, targets, confidences = [], [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)

            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())

            # Confidence is the largest class probability of each sample.
            class_probs = F.softmax(logits, dim=1)
            top_prob = torch.max(class_probs, dim=1).values
            confidences.extend(top_prob)

    return accuracy_score(targets, predictions), targets, predictions, confidences

First, compute the output confidence on the noiseless test set.

In [89]:
# Load the noiseless test set into memory and wrap it in a DataLoader
all_data, all_labels = load_in_memory(noiseless_testset)
test_tensor = TensorDataset(torch.stack(all_data), torch.tensor(all_labels))
test_loader = DataLoader(dataset=test_tensor, batch_size=BATCH_SIZE, shuffle=False)

# Evaluate the full-precision model on the noiseless data
(acc_full_precision,
 ground_truth_labels,
 predicted_labels,
 outputs_confidence) = test_model_confidence(test_loader=test_loader,
                                             model=full_precision_model,
                                             device=device)

In [90]:
def confidence_labels(ground_truth_labels, outputs_confidence, keywords=None):
    """
    Compute and print the mean confidence per ground-truth label.

    Args:
        ground_truth_labels: sequence of integer-like labels (Python ints,
            NumPy integers or 0-dim tensors), one per sample.
        outputs_confidence: sequence of 0-dim tensors (or a 1-D tensor) with
            one confidence value per sample, in the same order as the labels.
        keywords: optional iterable of class names; the name of label ``i``
            is taken as ``sorted(keywords)[i]``. Defaults to the notebook's
            global ``KEYWORDS``.

    Returns:
        dict mapping each label (int) present in ``ground_truth_labels`` to
        its mean confidence (float), in ascending label order. Empty input
        yields an empty dict.
    """
    if len(ground_truth_labels) == 0:
        return {}

    if isinstance(outputs_confidence, torch.Tensor):
        confidences = outputs_confidence
    else:
        confidences = torch.stack(list(outputs_confidence))

    # Build an explicit label tensor for the per-class mask. Comparing a
    # Python list against a scalar only broadcasts when the scalar happens
    # to be a NumPy type, so the conversion is done up front instead.
    label_ids = [int(label) for label in ground_truth_labels]
    labels_tensor = torch.tensor(label_ids, device=confidences.device)

    keyword_names = sorted(KEYWORDS if keywords is None else keywords)

    # mean confidence values for each label, in a deterministic order
    mean_confidence_per_label = {}
    for label in sorted(set(label_ids)):
        mean_confidence_per_label[label] = confidences[labels_tensor == label].mean().item()

    for label, mean_confidence in mean_confidence_per_label.items():
        print(f"Mean confidence for label {label} ('{keyword_names[label]}'): {mean_confidence:.4f}")

    return mean_confidence_per_label

# Per-label mean confidence on the noiseless test set, then the overall mean.
confidence_noiseless = confidence_labels(
    ground_truth_labels=ground_truth_labels,
    outputs_confidence=outputs_confidence,
)

mean_value = torch.stack(outputs_confidence).mean()
print(f"Mean confidence for all labels: {mean_value:.4f}")

Mean confidence for label 0 ('down'): 0.9747
Mean confidence for label 1 ('go'): 0.9769
Mean confidence for label 2 ('left'): 0.9900
Mean confidence for label 3 ('no'): 0.9844
Mean confidence for label 4 ('off'): 0.9687
Mean confidence for label 5 ('on'): 0.9790
Mean confidence for label 6 ('right'): 0.9871
Mean confidence for label 7 ('silence'): 0.9660
Mean confidence for label 8 ('stop'): 0.9944
Mean confidence for label 9 ('unknown'): 0.9349
Mean confidence for label 10 ('up'): 0.9826
Mean confidence for label 11 ('yes'): 0.9923
Mean confidence for all labels: 0.9776


Now, compare the output confidence when noise is added to the test data.

In [91]:
# Noisy evaluation set: pink noise mixed in at an SNR of 24 dB
noisy_testset = AddNoiseOnTestset(
    audio_path=TESTSET_PATH,
    # mel-spectrogram settings
    n_mels=N_MELS, n_fft=N_FFT,
    hop_length=HOP_LENGTH, win_length=WIN_LENGTH,
    f_min=F_MIN, f_max=F_MAX, power=POWER,
    # waveform settings
    sample_rate=SAMPLE_RATE, duration_seconds=DURATION_SEC,
    keywords=KEYWORDS,
    device=device,
    # noise settings
    noise_type="pink", snr_db=24,
)

In [92]:
# Load the noisy test set into memory and wrap it in a DataLoader
all_data, all_labels = load_in_memory(noisy_testset)
test_tensor = TensorDataset(torch.stack(all_data), torch.tensor(all_labels))
test_loader = DataLoader(dataset=test_tensor, batch_size=BATCH_SIZE, shuffle=False)

# Evaluate the full-precision model on the noisy data
(acc_full_precision,
 ground_truth_labels,
 predicted_labels,
 outputs_confidence) = test_model_confidence(test_loader=test_loader,
                                             model=full_precision_model,
                                             device=device)

In [93]:
# NOTE: this cell re-defines confidence_labels, which an earlier cell already
# defines; the later definition is the one in effect from here on.
def confidence_labels(ground_truth_labels, outputs_confidence, keywords=None):
    """
    Compute and print the mean confidence per ground-truth label.

    Args:
        ground_truth_labels: sequence of integer-like labels (Python ints,
            NumPy integers or 0-dim tensors), one per sample.
        outputs_confidence: sequence of 0-dim tensors (or a 1-D tensor) with
            one confidence value per sample, in the same order as the labels.
        keywords: optional iterable of class names; the name of label ``i``
            is taken as ``sorted(keywords)[i]``. Defaults to the notebook's
            global ``KEYWORDS``.

    Returns:
        dict mapping each label (int) present in ``ground_truth_labels`` to
        its mean confidence (float), in ascending label order. Empty input
        yields an empty dict.
    """
    if len(ground_truth_labels) == 0:
        return {}

    if isinstance(outputs_confidence, torch.Tensor):
        confidences = outputs_confidence
    else:
        confidences = torch.stack(list(outputs_confidence))

    # Build an explicit label tensor for the per-class mask. Comparing a
    # Python list against a scalar only broadcasts when the scalar happens
    # to be a NumPy type, so the conversion is done up front instead.
    label_ids = [int(label) for label in ground_truth_labels]
    labels_tensor = torch.tensor(label_ids, device=confidences.device)

    keyword_names = sorted(KEYWORDS if keywords is None else keywords)

    # mean confidence values for each label, in a deterministic order
    mean_confidence_per_label = {}
    for label in sorted(set(label_ids)):
        mean_confidence_per_label[label] = confidences[labels_tensor == label].mean().item()

    for label, mean_confidence in mean_confidence_per_label.items():
        print(f"Mean confidence for label {label} ('{keyword_names[label]}'): {mean_confidence:.4f}")

    return mean_confidence_per_label

# Per-label mean confidence on the pink-noise test set, then the overall mean.
# NOTE(review): despite the "_3db" suffix, noisy_testset is built with
# snr_db=24; the name is kept because a later cell reads it.
confidence_pink_3db = confidence_labels(
    ground_truth_labels=ground_truth_labels,
    outputs_confidence=outputs_confidence,
)

mean_value = torch.stack(outputs_confidence).mean()
print(f"Mean confidence for all labels: {mean_value:.4f}")

Mean confidence for label 0 ('down'): 0.9560
Mean confidence for label 1 ('go'): 0.9716
Mean confidence for label 2 ('left'): 0.9817
Mean confidence for label 3 ('no'): 0.9837
Mean confidence for label 4 ('off'): 0.9468
Mean confidence for label 5 ('on'): 0.9762
Mean confidence for label 6 ('right'): 0.9798
Mean confidence for label 7 ('silence'): 0.9677
Mean confidence for label 8 ('stop'): 0.9800
Mean confidence for label 9 ('unknown'): 0.9254
Mean confidence for label 10 ('up'): 0.9760
Mean confidence for label 11 ('yes'): 0.9892
Mean confidence for all labels: 0.9696


Difference in the output confidence between the noiseless and the noisy test set

In [95]:
# Display the per-label mean confidences computed for the noiseless test set.
confidence_noiseless

{0: 0.9747442603111267,
 1: 0.9768653512001038,
 2: 0.9899606704711914,
 3: 0.9843631982803345,
 4: 0.9686923027038574,
 5: 0.979042112827301,
 6: 0.987056314945221,
 7: 0.9659756422042847,
 8: 0.9943723678588867,
 9: 0.9349432587623596,
 10: 0.9825799465179443,
 11: 0.9923374056816101}

In [97]:
# Per-label change in mean confidence: noiseless value minus pink-noise value
# (a positive number means the model is less confident on the noisy data).
for label in set(ground_truth_labels):
    keyword = sorted(KEYWORDS)[label]
    confidence_drop = confidence_noiseless[label] - confidence_pink_3db[label]
    print(f"Confidence diffence for label {label} ('{keyword}'): {confidence_drop:.4f}")

Confidence diffence for label 0 ('down'): 0.0187
Confidence diffence for label 1 ('go'): 0.0052
Confidence diffence for label 2 ('left'): 0.0083
Confidence diffence for label 3 ('no'): 0.0006
Confidence diffence for label 4 ('off'): 0.0218
Confidence diffence for label 5 ('on'): 0.0029
Confidence diffence for label 6 ('right'): 0.0072
Confidence diffence for label 7 ('silence'): -0.0018
Confidence diffence for label 8 ('stop'): 0.0144
Confidence diffence for label 9 ('unknown'): 0.0096
Confidence diffence for label 10 ('up'): 0.0066
Confidence diffence for label 11 ('yes'): 0.0031
