In [1]:
import warnings
# Gotcha: this installs a blanket "ignore" filter, so *every* warning category
# is silenced for the rest of the kernel session -- including numerical
# RuntimeWarnings (e.g. divide-by-zero / invalid value) that later cells might raise.
warnings.filterwarnings('ignore')

In [1]:
import os
import librosa
import numpy as np
import scipy
import argparse
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC, pipeline, Speech2TextForConditionalGeneration, Speech2TextProcessor
import torch
from datasets import load_dataset
import soundfile as sf
from pyannote.audio import Inference, Model
import numpy as np
from speechbrain.utils.metric_stats import EER
from scipy.spatial.distance import cdist
from pyannote.audio import Model
from torcheval.metrics import WordErrorRate
from time import time
from tqdm import tqdm

2024-03-19 03:07:22.222822: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-19 03:07:22.578637: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 03:07:22.578701: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 03:07:22.638346: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-19 03:07:22.760648: I tensorflow/core/platform/cpu_feature_guar

In [3]:
# !pip install torcheval
# !pip install SentencePiece
# !pip install datasets transformers[sentencepiece]
# !pip install sentencepiece

## McAdams LPC

In [2]:
import numpy as np
import librosa
import scipy.signal
import scipy.io.wavfile
import os
from datasets import load_dataset

class McAdamsLPC:
    """Frame-wise LPC pole-angle warping ("McAdams" transform) of a speech signal.

    Each analysis frame is modelled with linear prediction; the angles of the
    complex LPC poles are raised to the power ``mcadams_coeff`` and the frame is
    re-synthesised from its LPC residual with the modified filter. Frames are
    recombined by overlap-add.

    Attributes:
        window_ms: Analysis window length in milliseconds.
        shift_ms: Hop between consecutive frames in milliseconds.
        lp_order: Order of the linear predictor.
        mcadams_coeff: Exponent applied to the pole angles.
    """

    def __init__(self, window_ms=20, shift_ms=10, lp_order=20, mcadams_coeff=0.8):
        self.window_ms = window_ms
        self.shift_ms = shift_ms
        self.lp_order = lp_order
        self.mcadams_coeff = mcadams_coeff

    @staticmethod
    def _peak_normalize(signal):
        """Scale ``signal`` so that its largest absolute sample is 1.

        Empty and all-zero inputs are returned unscaled: dividing by a zero
        peak would fill the whole array with NaN.
        """
        signal = np.asarray(signal)
        if signal.size == 0:
            return signal
        peak = np.max(np.abs(signal))
        if peak > 0:
            return signal / peak
        return signal

    def load_audio_file(self, file_path):
        """Load ``file_path`` at its native sampling rate.

        Returns:
            ``(signal, sample_rate)`` as returned by ``librosa.load``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"{file_path} not found.")
        return librosa.load(file_path, sr=None)

    def calculate_window_parameters(self, sample_rate):
        """Convert the window and shift durations from milliseconds to samples.

        Returns:
            ``(window_len_samples, shift_samples)``, both rounded down to ints.
        """
        window_len_samples = int(np.floor(self.window_ms * 0.001 * sample_rate))
        shift_samples = int(np.floor(self.shift_ms * 0.001 * sample_rate))
        return window_len_samples, shift_samples

    def process_signal(self, signal, sample_rate, window_len_samples, shift_samples):
        """Anonymise ``signal`` frame by frame and overlap-add the result.

        The loop starts at frame 1, so frame 0 is never processed and the first
        ``shift_samples`` output samples stay zero; this is kept unchanged to
        preserve existing output.

        Returns:
            Array of ``len(signal)`` samples, peak-normalised to 1. If no frame
            fits into the signal (or the result is silent) an all-zero array is
            returned instead of NaNs.
        """
        num_samples = len(signal)
        # Number of complete frames that fit into the signal.
        num_frames = 1 + (num_samples - window_len_samples) // shift_samples
        signal_rec = np.zeros(num_samples)
        for frame_num in range(1, num_frames):
            frame, out_index = self.extract_frame(signal, frame_num, window_len_samples, shift_samples)
            frame_rec = self.process_frame(frame, sample_rate)
            signal_rec[out_index] += frame_rec  # overlap-add
        return self._peak_normalize(signal_rec)

    def extract_frame(self, signal, frame_num, window_len_samples, shift_samples):
        """Cut out frame ``frame_num`` and apply a Hann window to it.

        The frame is clipped at the end of the signal; the window is sized to
        the samples actually available so that a clipped frame no longer
        triggers a shape mismatch.

        Returns:
            ``(frame, indices)`` where ``indices`` are the positions of the
            frame's samples within ``signal``.
        """
        start = frame_num * shift_samples
        end = min(start + window_len_samples, len(signal))
        segment = signal[start:end]
        frame = segment * np.hanning(len(segment))
        return frame, np.arange(start, start + len(frame))

    def process_frame(self, frame, sample_rate):
        """Apply the pole-angle transform to one windowed frame.

        ``sample_rate`` is accepted for interface compatibility but not used.

        Returns:
            The re-synthesised frame, Hann-windowed again for overlap-add.
        """
        eps = np.finfo(np.float32).eps
        # Offset by eps before LPC analysis; build a new array instead of
        # ``frame += eps`` so the caller's buffer is never modified.
        frame = frame + eps
        a_lpc = librosa.lpc(frame, order=self.lp_order)
        poles = scipy.signal.tf2zpk([1], a_lpc)[1]
        # First member of every complex pole pair (assumes the two members of a
        # pair are adjacent in ``poles``).
        ind_imag = np.where(~np.isreal(poles))[0][::2]
        new_poles = self.apply_mcadams(poles, ind_imag)
        a_lpc_new = np.real(np.poly(new_poles))
        # Inverse-filter to obtain the residual, then re-synthesise it with
        # the modified all-pole filter.
        res = scipy.signal.lfilter(a_lpc, [1], frame)
        return scipy.signal.lfilter([1], a_lpc_new, res) * np.hanning(len(frame))

    def apply_mcadams(self, poles, ind_imag_con):
        """Return a copy of ``poles`` with the selected pole pairs' angles warped.

        For every index ``k`` in ``ind_imag_con`` the pole at ``k`` and the one
        at ``k + 1`` are treated as a pair: their angle magnitude is raised to
        ``mcadams_coeff``, clipped to ``[0, pi]``, and written back as ``+angle``
        at ``k`` and ``-angle`` at ``k + 1`` with the magnitude of pole ``k``.

        The magnitude of the angle is used because a negative angle raised to
        a fractional power is NaN, which would poison the whole filter.
        """
        poles = np.asarray(poles)
        pair_idx = np.asarray(ind_imag_con, dtype=int)
        angles = np.abs(np.angle(poles[pair_idx]))
        new_angles = np.clip(angles ** self.mcadams_coeff, 0, np.pi)
        radii = np.abs(poles[pair_idx])
        new_poles = np.copy(poles)
        new_poles[pair_idx] = radii * np.exp(1j * new_angles)
        new_poles[pair_idx + 1] = radii * np.exp(-1j * new_angles)
        return new_poles

    def denoise_signal(self, input_file, output_file, n_fft=2048, hop_length=512, win_length=2048):
        """Spectrally gate ``input_file`` and write the result to ``output_file``.

        STFT bins whose magnitude does not exceed 1.5 times the mean (over
        frequency bins) of the per-bin median magnitude are zeroed. The
        inverse STFT is peak-normalised and written as a float32 WAV file.
        """
        signal, sr = librosa.load(input_file, sr=None)
        stft_signal = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        magnitude = np.abs(stft_signal)
        noise_amp = np.median(magnitude, axis=1, keepdims=True)
        spectral_gate = np.mean(noise_amp) * 1.5
        stft_signal_denoised = np.where(magnitude > spectral_gate, stft_signal, 0)
        signal_denoised = librosa.istft(stft_signal_denoised, hop_length=hop_length, win_length=win_length)
        # Guarded normalisation: a fully gated (silent) result stays silent
        # instead of becoming NaN.
        signal_denoised_normalized = self._peak_normalize(signal_denoised)
        scipy.io.wavfile.write(output_file, sr, np.float32(signal_denoised_normalized))

    @classmethod
    def anonymize_dataset(cls, dataset_name, dataset_config, split, output_directory,
                          anonymization_params=None, denoising_params=None):
        """Anonymise and denoise every sample of a Hugging Face dataset split.

        Args:
            dataset_name, dataset_config, split: Forwarded to ``load_dataset``.
            output_directory: Directory for the output WAV files (created if
                missing).
            anonymization_params: Optional keyword arguments for the class
                constructor.
            denoising_params: Optional keyword arguments for ``denoise_signal``.

        Returns:
            ``(output_paths, dataset)``: the written file paths in dataset
            order, and the loaded dataset.
        """
        # ``None`` defaults avoid sharing one mutable dict between calls.
        anonymization_params = dict(anonymization_params or {})
        denoising_params = dict(denoising_params or {})

        ds = load_dataset(dataset_name, dataset_config, split=split)
        os.makedirs(output_directory, exist_ok=True)

        # The instance only carries parameters, so one is enough for all files.
        mcadams_instance = cls(**anonymization_params)
        total = len(ds)
        anon_instances = []
        for i, sample in enumerate(ds):
            file_path = sample["file"]
            output_file_path = os.path.join(output_directory, f"anonymized_denoised_{i}.wav")
            # Built directly rather than via str.replace('.wav', ...), which
            # would also rewrite a '.wav' occurring in the directory name.
            temp_output_path = os.path.join(
                output_directory, f"anonymized_denoised_{i}_temp_anonymized.wav"
            )
            signal, sample_rate = mcadams_instance.load_audio_file(file_path)
            window_len_samples, shift_samples = mcadams_instance.calculate_window_parameters(sample_rate)
            anonymized_signal = mcadams_instance.process_signal(signal, sample_rate, window_len_samples, shift_samples)
            try:
                mcadams_instance.save_signal(temp_output_path, anonymized_signal, sample_rate)
                mcadams_instance.denoise_signal(temp_output_path, output_file_path, **denoising_params)
            finally:
                # Never leave the intermediate file behind, even on failure.
                if os.path.exists(temp_output_path):
                    os.remove(temp_output_path)
            anon_instances.append(output_file_path)
            print(f"Processed file {i+1}/{total}: {output_file_path}")
        return anon_instances, ds

    def save_signal(self, file_path, signal, sample_rate):
        """Peak-normalise ``signal`` and write it to ``file_path`` as float32 WAV."""
        normalized_signal = self._peak_normalize(signal)
        scipy.io.wavfile.write(file_path, sample_rate, np.float32(normalized_signal))


In [3]:
# Anonymise and denoise the validation split of the LibriSpeech demo set.
# `all_names` receives the written file paths, `ds` the loaded dataset.
mcd = McAdamsLPC()
all_names, ds = mcd.anonymize_dataset(
    dataset_name="hf-internal-testing/librispeech_asr_demo",
    dataset_config="clean",
    split="validation",
    output_directory="./anonymized_dataset",
)

Processed file 1/73: ./anonymized_dataset/anonymized_denoised_0.wav
Processed file 2/73: ./anonymized_dataset/anonymized_denoised_1.wav
Processed file 3/73: ./anonymized_dataset/anonymized_denoised_2.wav
Processed file 4/73: ./anonymized_dataset/anonymized_denoised_3.wav
Processed file 5/73: ./anonymized_dataset/anonymized_denoised_4.wav
Processed file 6/73: ./anonymized_dataset/anonymized_denoised_5.wav
Processed file 7/73: ./anonymized_dataset/anonymized_denoised_6.wav
Processed file 8/73: ./anonymized_dataset/anonymized_denoised_7.wav
Processed file 9/73: ./anonymized_dataset/anonymized_denoised_8.wav
Processed file 10/73: ./anonymized_dataset/anonymized_denoised_9.wav
Processed file 11/73: ./anonymized_dataset/anonymized_denoised_10.wav
Processed file 12/73: ./anonymized_dataset/anonymized_denoised_11.wav
Processed file 13/73: ./anonymized_dataset/anonymized_denoised_12.wav
Processed file 14/73: ./anonymized_dataset/anonymized_denoised_13.wav
Processed file 15/73: ./anonymized_data

In [8]:
# SECURITY: the Hugging Face access token must not be committed with the
# notebook. It is read from the HF_TOKEN environment variable instead (None if
# unset). The token that used to be hard-coded here should be revoked.
config = {
    's2t': {
        'model_name': 'facebook/s2t-small-librispeech-asr'
    },
    'hf_token': os.environ.get('HF_TOKEN'),
    'save_dir': './audio'
}

# Settings of the speech-to-text model (s2t-small-librispeech-asr).
_config = config['s2t']

# Speech-to-text model and its matching processor; both are used as globals by
# compute_wer.
model = Speech2TextForConditionalGeneration.from_pretrained(_config['model_name'])
processor = Speech2TextProcessor.from_pretrained(_config['model_name'])

Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def _same_and_cross_group_scores(embeddings, labels):
    """Split pairwise cosine similarities into same-label and cross-label scores.

    Every unordered pair of distinct embeddings is scored exactly once with
    ``max(0, 1 - cosine_distance)``; an embedding is never compared with itself.

    Returns:
        ``(same_label_scores, cross_label_scores)`` as 1-D float arrays.
    """
    stacked = np.vstack([np.asarray(emb).reshape(1, -1) for emb in embeddings])
    labels = np.asarray(labels)
    similarity = 1.0 - cdist(stacked, stacked, metric="cosine")
    # Floor at 0. Written with np.where so that NaN also maps to 0, exactly as
    # the builtin ``max(0, s)`` would do.
    similarity = np.where(similarity > 0, similarity, 0.0)
    rows, cols = np.triu_indices(len(labels), k=1)  # i < j: no self/duplicate pairs
    pair_scores = similarity[rows, cols]
    same_label = labels[rows] == labels[cols]
    return pair_scores[same_label], pair_scores[~same_label]


def compute_eer(orig_src, anon_src, auth_token=None):
    """Equal error rate between original and anonymised recordings.

    Embeddings are extracted with the ``pyannote/embedding`` model, one per
    file. Pairs taken from the same group (original/original or
    anonymised/anonymised) are scored as positives, pairs across the two
    groups as negatives, and the EER is computed over these two score sets.

    Args:
        orig_src: Paths of the original audio files.
        anon_src: Paths of the anonymised audio files.
        auth_token: Hugging Face access token for the embedding model. When
            omitted, the ``HF_TOKEN`` environment variable is used, so that no
            secret has to live in the notebook.

    Returns:
        The equal error rate reported by ``EER``.

    Raises:
        ValueError: If either list is empty, or fewer than three files are
            given in total (no same-group pair could be formed).
    """
    orig_src = list(orig_src)
    anon_src = list(anon_src)
    if not orig_src or not anon_src:
        raise ValueError("compute_eer needs at least one original and one anonymized file.")
    if len(orig_src) + len(anon_src) < 3:
        raise ValueError("compute_eer needs at least three files in total to form a same-group pair.")

    if auth_token is None:
        auth_token = os.environ.get("HF_TOKEN")

    model_emb = Model.from_pretrained("pyannote/embedding", use_auth_token=auth_token)
    inference = Inference(model_emb, window="whole")

    orig_embeddings = [inference(f) for f in orig_src]
    anon_embeddings = [inference(f) for f in anon_src]

    # +1 marks an original recording, -1 an anonymised one.
    label_list = [1] * len(orig_embeddings) + [-1] * len(anon_embeddings)
    positive_scores, negative_scores = _same_and_cross_group_scores(
        orig_embeddings + anon_embeddings, label_list
    )

    val_eer, _threshold = EER(torch.tensor(positive_scores), torch.tensor(negative_scores))

    return val_eer

def compute_wer(orig_texts, anon_paths):
    """Word error rate of the ASR transcripts of anonymised audio.

    The files in ``anon_paths`` are transcribed in a single batch with the
    module-level ``processor`` and ``model``; each transcript is compared,
    case-insensitively, with the reference text at the same position.

    Args:
        orig_texts: Reference transcripts, one per file.
        anon_paths: Paths of the anonymised audio files, in the same order.

    Returns:
        ``(wer, orig_texts, anon_texts)`` where ``anon_texts`` are the decoded
        transcripts.

    Raises:
        ValueError: If ``anon_paths`` is empty, if the number of texts and
            paths differ, or if the files do not share one sampling rate.
    """
    anon_paths = list(anon_paths)
    if not anon_paths:
        raise ValueError("anon_paths is empty; nothing to transcribe.")
    if len(orig_texts) != len(anon_paths):
        # zip() would silently drop the surplus and skew the metric.
        raise ValueError(
            f"Got {len(orig_texts)} reference texts for {len(anon_paths)} audio files."
        )

    metric = WordErrorRate()

    # Load the anonymised audio.
    anon_arrs = []
    anon_rates = set()
    for p in anon_paths:
        arr, rate = sf.read(p)
        anon_arrs.append(arr)
        anon_rates.add(rate)
    if len(anon_rates) != 1:
        # The processor takes one sampling rate for the whole batch.
        raise ValueError(f"Audio files have differing sampling rates: {sorted(anon_rates)}")
    (sampling_rate,) = anon_rates

    # Speech-to-text on the whole batch.
    inputs = processor(anon_arrs, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
    anon_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    for org, an in zip(orig_texts, anon_texts):
        metric.update([an.lower()], [org.lower()])

    return metric.compute().item(), orig_texts, anon_texts

In [None]:
# EER between the original recordings and their anonymised counterparts.
compute_eer(
    orig_src=[sample['path'] for sample in ds["audio"]],
    anon_src=all_names,
)

In [10]:
# WER of the ASR model on the anonymised audio, plus reference and decoded texts.
rate, true_ones, anon_ones = compute_wer(
    orig_texts=ds['text'],
    anon_paths=all_names,
)

In [11]:
# Display the word error rate returned by compute_wer above.
rate

0.18347826600074768