In [16]:
import torchaudio
import torch.nn.functional as F
from speechbrain.inference.speaker import EncoderClassifier

classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

reference, rate = torchaudio.load('./Datasets/audio_to_anon/LibriSpeech/dev-clean/1462/170138/1462-170138-0001.flac')
reference_emb = classifier.encode_batch(reference)

speaker, _ = torchaudio.load('./Datasets/audio_to_anon/LibriSpeech/dev-clean/2086/149214/2086-149214-0000.flac')
speaker_emb = classifier.encode_batch(speaker)

anonim, _ = torchaudio.load('./converted_sound_Waveunet_test/*.flac/1462/170138/1462-170138-0000.flac')
anonim_emb = classifier.encode_batch(anonim)

room, _ = torchaudio.load('./add_room.wav')
room_emb = classifier.encode_batch(room)

In [17]:
F.cosine_similarity(reference_emb, speaker_emb, dim=2)

tensor([[0.0579]])

In [13]:
F.cosine_similarity(reference_emb, anonim_emb, dim=2)

tensor([[0.2726]])

In [10]:
F.cosine_similarity(reference_emb, room_emb, dim=2)

tensor([[0.2385]])

In [4]:
torchaudio.save('processed_audio.wav', anonim, 16000)

# Verification accuracy

## ECAPA

In [17]:
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
import re
import glob
from tqdm.auto import tqdm 
from IPython.display import clear_output
import numpy as np

In [18]:
def find_id(file_path: str, pattern: str = None) -> str:
    pattern = pattern or r'\/(\d{2,4})\/'
    return re.search(pattern, file_path).group(1)

def parse_ids(file_paths: list[str]) -> list[str]:
    return list(set(find_id(file_path) for file_path in file_paths))

def find_all_speaker_audio(folder: str, speaker_id: str) -> list[str]:
    return glob.glob(folder + speaker_id + '/*/*.flac')

def load_audios(speaker_files: list[str]) -> list[torch.Tensor]:
    audios = []
    
    for file in speaker_files:
        audio, rate = torchaudio.load(file)
        assert rate == 16000
        audios.append(audio.to(device))
    
    return audios

def check_verification_model(model: torch.nn.Module, speaker_id: str, folder: str, num_samples: int = 10, threshhold: float = 0.25):
    speaker_files = find_all_speaker_audio(folder, speaker_id)
    
    speaker_files_subset = list(np.random.choice(speaker_files, num_samples, replace=False))
    referense_file = np.random.choice(find_all_speaker_audio(original_folder, speaker_id))
    
    audios = load_audios(speaker_files)
    reference_audio, _ = torchaudio.load(referense_file)
    
    reference_embedding = model.encode_batch(reference_audio)
    
    speaker_embeddings = []
    for audio in audios:
        speaker_embeddings.append(model.encode_batch(audio))
    
    scores = []
    for speaker_embedding in speaker_embeddings:
        scores.append(F.cosine_similarity(speaker_embedding, reference_embedding, dim=2).cpu().item())
        
    scores = np.array(scores)
    accuracy = (scores > threshhold).sum() / len(scores)
    
    return accuracy

In [19]:
converted_files = glob.glob('./converted_sound_Waveunet_test/*.flac/*/*/*.flac')
ids = parse_ids(converted_files)

print(f'Number of audios: {len(converted_files)},\nNumber of speakers: {len(ids)}')

Number of audios: 2703,
Number of speakers: 40


In [4]:
converted_folder = './converted_sound_Waveunet_test/*.flac/'
original_folder = './Datasets/audio_to_anon/LibriSpeech/dev-clean/'
rir_folder = './processed/'

num_samples = 10
device = 'cuda:0'

scores_by_ids = dict.fromkeys(ids)
scores_by_ids_anonim = dict.fromkeys(ids)
scores_by_ids_rir = dict.fromkeys(ids)

model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device": device})

for i, speaker_id in enumerate(ids):
    # my tqdm
    clear_output(wait=False)
    print(i)
    
    scores_by_ids[speaker_id] = check_verification_model(model, speaker_id, original_folder, threshhold=0.3, num_samples=20)
    scores_by_ids_anonim[speaker_id] = check_verification_model(model, speaker_id, converted_folder, threshhold=0.3, num_samples=20)
    scores_by_ids_rir[speaker_id] = check_verification_model(model, speaker_id, rir_folder, threshhold=0.3, num_samples=20)

39


In [7]:
print(f'Initial accuracy: {np.mean(list(scores_by_ids.values())):.2f}')
print(f'Anonimized accuracy: {np.mean(list(scores_by_ids_anonim.values())):.2f}')
print(f'RIR accuracy: {np.mean(list(scores_by_ids_rir.values())):.2f}')

Initial accuracy: 0.99
Anonimized accuracy: 0.12
RIR accuracy: 0.05


## Pyannonte

In [67]:
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/embedding", 
                              use_auth_token="your_hugginface_token")

from pyannote.audio import Inference
inference = Inference(model, window="whole")

Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.2.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.2.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.2.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.3.0+cu118. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 3.2.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.3.0+cu118. Bad things might happen unless you revert torch to 1.x.


In [70]:
def check_verification_model(model: torch.nn.Module, speaker_id: str, folder: str, num_samples: int = 10, threshhold: float = 0.25):
    with torch.no_grad():
        speaker_files = find_all_speaker_audio(folder, speaker_id)

        speaker_files_subset = list(np.random.choice(speaker_files, num_samples, replace=False))
        referense_file = np.random.choice(find_all_speaker_audio(original_folder, speaker_id))

        audios = load_audios(speaker_files)
        reference_audio, _ = torchaudio.load(referense_file)

        reference_embedding = model(reference_audio.to(device))

        speaker_embeddings = []
        for audio in audios:
            speaker_embeddings.append(model(audio))

        scores = []
        for speaker_embedding in speaker_embeddings:
            scores.append(F.cosine_similarity(speaker_embedding, reference_embedding, dim=1).cpu().item())

        scores = np.array(scores)
        accuracy = (scores > threshhold).sum() / len(scores)
    
    return accuracy

In [71]:
converted_folder = './converted_sound_Waveunet_test/*.flac/'
original_folder = './Datasets/audio_to_anon/LibriSpeech/dev-clean/'
rir_folder = './processed/'

num_samples = 10
device = 'cuda:0'

scores_by_ids = dict.fromkeys(ids)
scores_by_ids_anonim = dict.fromkeys(ids)
scores_by_ids_rir = dict.fromkeys(ids)

model.to(device)

for i, speaker_id in enumerate(ids):
    # my tqdm
    clear_output(wait=False)
    print(i)
    
    scores_by_ids[speaker_id] = check_verification_model(model, speaker_id, original_folder, threshhold=0.3, num_samples=20)
    scores_by_ids_anonim[speaker_id] = check_verification_model(model, speaker_id, converted_folder, threshhold=0.3, num_samples=20)
    scores_by_ids_rir[speaker_id] = check_verification_model(model, speaker_id, rir_folder, threshhold=0.3, num_samples=20)

39


In [72]:
print(f'Initial accuracy: {np.mean(list(scores_by_ids.values())):.2f}')
print(f'Anonimized accuracy: {np.mean(list(scores_by_ids_anonim.values())):.2f}')
print(f'RIR accuracy: {np.mean(list(scores_by_ids_rir.values())):.2f}')

Initial accuracy: 0.97
Anonimized accuracy: 0.07
RIR accuracy: 0.02


# Wespeaker

In [5]:
import wespeaker

model = wespeaker.load_model('english')



In [27]:
def wespeaker_inference(model: torch.nn.Module, sample: torch.Tensor) -> torch.Tensor:
    with torch.no_grad():
        sample = model.compute_fbank(sample, sample_rate=model.resample_rate, cmn=True)
        sample = sample.unsqueeze(0)
        sample = sample.to(model.device)
        outputs = model.model(sample)
        outputs = outputs[-1] if isinstance(outputs, tuple) else outputs
        embed = outputs[0]
    return embed
    

def check_verification_model(model: torch.nn.Module, speaker_id: str, folder: str, num_samples: int = 10, threshhold: float = 0.25):
    speaker_files = find_all_speaker_audio(folder, speaker_id)
    
    speaker_files_subset = list(np.random.choice(speaker_files, num_samples, replace=False))
    referense_file = np.random.choice(find_all_speaker_audio(original_folder, speaker_id))
    
    audios = load_audios(speaker_files)
    reference_audio, _ = torchaudio.load(referense_file)
    
    reference_embedding = wespeaker_inference(model, reference_audio)
    
    speaker_embeddings = []
    for audio in audios:
        speaker_embeddings.append(wespeaker_inference(model, audio))
    
    scores = []
    for speaker_embedding in speaker_embeddings:
        scores.append(F.cosine_similarity(speaker_embedding.unsqueeze(0), reference_embedding.unsqueeze(0), dim=1).cpu().item())
        
    scores = np.array(scores)
    accuracy = (scores > threshhold).sum() / len(scores)
    
    return accuracy

In [28]:
converted_folder = './converted_sound_Waveunet_test/*.flac/'
original_folder = './Datasets/audio_to_anon/LibriSpeech/dev-clean/'
rir_folder = './processed/'

scores_by_ids = dict.fromkeys(ids)
scores_by_ids_anonim = dict.fromkeys(ids)
scores_by_ids_rir = dict.fromkeys(ids)

for i, speaker_id in enumerate(ids):
    # my tqdm
    clear_output(wait=False)
    print(i)
    
    scores_by_ids[speaker_id] = check_verification_model(model, speaker_id, original_folder, threshhold=0.3, num_samples=20)
    scores_by_ids_anonim[speaker_id] = check_verification_model(model, speaker_id, converted_folder, threshhold=0.3, num_samples=20)
    scores_by_ids_rir[speaker_id] = check_verification_model(model, speaker_id, rir_folder, threshhold=0.3, num_samples=20)

39


In [29]:
print(f'Initial accuracy: {np.mean(list(scores_by_ids.values())):.2f}')
print(f'Anonimized accuracy: {np.mean(list(scores_by_ids_anonim.values())):.2f}')
print(f'RIR accuracy: {np.mean(list(scores_by_ids_rir.values())):.2f}')

Initial accuracy: 0.99
Anonimized accuracy: 0.44
RIR accuracy: 0.26


# WER

In [8]:
device = 'cuda:0'

In [9]:
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
import torchaudio

model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

model.to(device)

audio, _ = torchaudio.load('./Datasets/audio_to_anon/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac')
inputs = processor(audio.squeeze(), sampling_rate=16_000, return_tensors="pt")
generated_ids = model.generate(inputs["input_features"].to(device), attention_mask=inputs["attention_mask"].to(device))

transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription



['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel']

In [12]:
import pandas as pd
import glob
import re
from evaluate import load
wer = load("wer")

In [13]:
def load_txt(path: str) -> pd.DataFrame:
    texts = pd.read_csv(path, header=None)
    texts.columns = ['text']
    texts[['code', 'transcription']] = texts.text.str.split(' ', n=1, expand=True)
    return texts


def transcribe_audio(path_to_audio: str) -> str:
    audio, _ = torchaudio.load(path_to_audio)
    inputs = processor(audio.squeeze(), sampling_rate=16_000, return_tensors="pt")
    generated_ids = model.generate(inputs["input_features"].to(device), attention_mask=inputs["attention_mask"].to(device))

    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription

In [29]:
from jiwer import wer

first_folders = glob.glob('./Datasets/audio_to_anon/LibriSpeech/dev-clean/*')

wer_score = 0
num_files = 0

for i, first_folder in enumerate(first_folders):
    clear_output(wait=False)
    print(i)
    
    second_folders = glob.glob(first_folder + '/*')

    for second_folder in second_folders:
    
        third_folders = glob.glob(second_folder + '/*.flac')
        text_file = glob.glob(second_folder + '/*.txt')[0]
        
        texts_df = load_txt(text_file)
        
        for audio_path in third_folders:
            prediction = transcribe_audio(audio_path)[0]
            
            code = re.search(r'/([^/]+)\.flac$', audio_path).group(1)
            reference = texts_df.loc[texts_df.code == code].transcription.tolist()[0]
            
            # print(f'Reference: {reference.lower()}')
            # print(f'Prediction: {prediction[0].lower()}')
            
            wer_score += wer(reference.lower(), prediction.lower())
            num_files += 1

average_wer = wer_score / num_files
print(f"Average WER: {average_wer}")

39




Average WER: 0.03800558941382623


In [32]:
from jiwer import wer

first_folders = glob.glob('./converted_sound_Waveunet_test/*.flac/*')

wer_score = 0
num_files = 0

for i, first_folder in enumerate(first_folders):
    clear_output(wait=False)
    print(i)
    
    second_folders = glob.glob(first_folder + '/*')

    for second_folder in second_folders:
    
        third_folders = glob.glob(second_folder + '/*.flac')
        
        txt_folder = second_folder.replace('./converted_sound_Waveunet_test/*.flac/', './Datasets/audio_to_anon/LibriSpeech/dev-clean/')
        text_file = glob.glob(txt_folder + '/*.txt')[0]
        
        texts_df = load_txt(text_file)
        
        for audio_path in third_folders:
            prediction = transcribe_audio(audio_path)[0]
            
            code = re.search(r'/([^/]+)\.flac$', audio_path).group(1)
            reference = texts_df.loc[texts_df.code == code].transcription.tolist()[0]
            
            # print(f'Reference: {reference.lower()}')
            # print(f'Prediction: {prediction[0].lower()}')
            
            wer_score += wer(reference.lower(), prediction.lower())
            num_files += 1

average_wer = wer_score / num_files
print(f"Average WER: {average_wer}")

39




Average WER: 0.14456822142189207


In [33]:
from jiwer import wer

first_folders = glob.glob('./processed/*')

wer_score = 0
num_files = 0

for i, first_folder in enumerate(first_folders):
    clear_output(wait=False)
    print(i)
    
    second_folders = glob.glob(first_folder + '/*')

    for second_folder in second_folders:
    
        third_folders = glob.glob(second_folder + '/*.flac')
        
        txt_folder = second_folder.replace('./processed/', './Datasets/audio_to_anon/LibriSpeech/dev-clean/')
        text_file = glob.glob(txt_folder + '/*.txt')[0]
        
        texts_df = load_txt(text_file)
        
        for audio_path in third_folders:
            prediction = transcribe_audio(audio_path)[0]
            
            code = re.search(r'/([^/]+)\.flac$', audio_path).group(1)
            reference = texts_df.loc[texts_df.code == code].transcription.tolist()[0]
            
            # print(f'Reference: {reference.lower()}')
            # print(f'Prediction: {prediction[0].lower()}')
            
            wer_score += wer(reference.lower(), prediction.lower())
            num_files += 1

average_wer = wer_score / num_files
print(f"Average WER: {average_wer}")

39


  return F.conv1d(input, weight, bias, self.stride,


Average WER: 0.28955901365841685
