In [157]:
import os
from time import time

import librosa as lb
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from pyannote.audio import Inference, Model
from pyannote.audio import Model
from scipy.spatial.distance import cdist
from speechbrain.utils.metric_stats import EER
from torcheval.metrics import WordErrorRate
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
from transformers import pipeline

In [156]:
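# One-time setup: extra packages this notebook needs. Uncomment to install
# them into the kernel's environment (%pip targets the running kernel).
# %pip install pyannote.audio
# %pip install speechbrain
# %pip install torcheval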

In [110]:
# Notebook-wide settings; later cells read them by key.
config = {
    's2t': {
        # Checkpoint used for both the speech-to-text model and its processor.
        'model_name': 'facebook/s2t-small-librispeech-asr'
    },
    # Hugging Face access token, passed to Model.from_pretrained for the
    # pyannote embedding model. It is read from the HF_TOKEN environment
    # variable so the secret is never stored in the notebook (None when the
    # variable is unset). The token that used to be hardcoded here has been
    # exposed and should be revoked.
    'hf_token': os.environ.get('HF_TOKEN'),
    # Directory the anonymised wav files are written to.
    'save_dir': './audio'
}

In [11]:
# Evaluation data: the "clean" configuration, validation split, of the
# LibriSpeech ASR demo dataset.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
)

In [128]:
_config = config['s2t']

# Speech-to-text: the generation model and the processor that prepares its
# inputs and decodes its outputs, both loaded from the same checkpoint.
# Speech2TextForConditionalGeneration / Speech2TextProcessor must be imported
# in the notebook's import cell, otherwise this cell fails on a fresh kernel.
model = Speech2TextForConditionalGeneration.from_pretrained(_config['model_name'])
processor = Speech2TextProcessor.from_pretrained(_config['model_name'])

# Text-to-speech: SpeechT5 pipeline plus one fixed x-vector, so every
# synthesised utterance is conditioned on the same speaker embedding.
SPEAKER_XVECTOR_INDEX = 7306  # row of the x-vector dataset used as the target voice
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# unsqueeze(0) adds a leading dimension of size 1 to the x-vector.
speaker_embedding = torch.tensor(
    embeddings_dataset[SPEAKER_XVECTOR_INDEX]["xvector"]
).unsqueeze(0)

# Speaker-embedding model consumed by compute_eer; loaded with the configured
# Hugging Face token.
model_emb = Model.from_pretrained("pyannote/embedding",
                                  use_auth_token=config['hf_token'])

Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\razvor\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin`
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\razvor\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_mod

Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.2. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.1.2. Bad things might happen unless you revert torch to 1.x.


In [165]:
def anonimize(dataset):
    """Transcribe every utterance and re-synthesise it with the fixed speaker embedding.

    The audio is transcribed in one batch with ``processor``/``model``, the
    transcriptions are passed to ``synthesiser`` together with
    ``speaker_embedding``, and each synthesised utterance is written to
    ``config['save_dir']`` as ``speech_anon_<i>.wav``. The average wall-clock
    time per utterance (transcription + synthesis, excluding file writes) is
    printed.

    Args:
        dataset: Object whose ``'audio'`` column yields items exposing
            ``'array'`` and ``'sampling_rate'``.

    Returns:
        list[str]: Paths of the written wav files, in dataset order. Empty
        when the dataset has no audio.

    Raises:
        ValueError: If the utterances do not all share one sampling rate
            (the processor is given a single rate for the whole batch).
    """
    t1 = time()

    # Materialise the column once: the original read dataset['audio'] twice,
    # fetching every item a second time just to collect the sampling rates.
    audio_items = list(dataset['audio'])
    if not audio_items:
        return []

    arrays = [item['array'] for item in audio_items]
    sampling_rates = {item['sampling_rate'] for item in audio_items}
    if len(sampling_rates) != 1:
        raise ValueError(
            f'All utterances must share one sampling rate, got {sorted(sampling_rates)}'
        )
    sampling_rate = sampling_rates.pop()

    # Speech -> text for the whole batch.
    inputs = processor(arrays, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
    transcriptions = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Text -> speech, always conditioned on the same speaker embedding.
    speeches = synthesiser(transcriptions, forward_params={"speaker_embeddings": speaker_embedding})

    t2 = time()
    run_time = (t2 - t1) / len(audio_items)
    print(f'Avg run time per audio {np.round(run_time, 4)}s')

    # sf.write does not create missing directories.
    os.makedirs(config['save_dir'], exist_ok=True)

    ret_files = []
    for i, s in enumerate(speeches):
        name = config['save_dir']+f'/speech_anon_{i}.wav'
        sf.write(name, s["audio"], samplerate=s["sampling_rate"])
        ret_files.append(name)

    return ret_files
    

In [177]:
# Run the anonymisation pipeline over the whole dataset; anon_files holds the
# paths of the wav files it wrote, one per utterance.
anon_files = anonimize(ds)


Avg run time per audio 4.1784s


In [178]:
def compute_eer(orig_src, anon_src):
    """Equal error rate of a cosine-similarity test between original and anonymised audio.

    Every input is embedded with ``model_emb`` over its whole duration. Each
    unordered pair of *distinct* inputs forms one trial, scored with cosine
    similarity clipped below at 0. Pairs from the same group (both original
    or both anonymised) are positive trials; pairs mixing the two groups are
    negative trials. Self-comparisons are excluded: they always score 1.0 and
    would only pad the positive set with trivial trials.

    Args:
        orig_src: Iterable of inputs accepted by the ``Inference`` callable
            for the original recordings. A single ``str`` is treated as one
            file rather than iterated character by character.
        anon_src: Same, for the anonymised recordings.

    Returns:
        The first element of the pair returned by ``EER``.

    Raises:
        ValueError: If there is no positive or no negative trial to score.
    """
    if isinstance(orig_src, str):
        orig_src = [orig_src]
    if isinstance(anon_src, str):
        anon_src = [anon_src]

    inference = Inference(model_emb, window="whole")
    orig_embeddings = [inference(f) for f in orig_src]
    anon_embeddings = [inference(f) for f in anon_src]

    emb_list = orig_embeddings + anon_embeddings
    # Group labels: 1 = original, -1 = anonymised.
    labels = np.array([1] * len(orig_embeddings) + [-1] * len(anon_embeddings))
    if len(emb_list) < 2:
        raise ValueError('Need at least two recordings to form a trial')

    # One row per recording, then a single pairwise cosine-distance matrix
    # instead of one cdist call per pair.
    emb_matrix = np.vstack([np.asarray(e).reshape(1, -1) for e in emb_list])
    # fmax (not maximum) so a NaN distance maps to 0, as max(0, nan) did.
    similarity = np.fmax(1.0 - cdist(emb_matrix, emb_matrix, metric="cosine"), 0.0)

    # Strict upper triangle: each unordered pair once, no self-pairs.
    rows, cols = np.triu_indices(len(emb_list), k=1)
    pair_scores = similarity[rows, cols]
    same_group = labels[rows] == labels[cols]

    positive_scores = pair_scores[same_group].tolist()
    negative_scores = pair_scores[~same_group].tolist()
    if not positive_scores or not negative_scores:
        raise ValueError(
            'EER needs at least one same-group pair and one cross-group pair; '
            f'got {len(positive_scores)} positive and {len(negative_scores)} negative trials'
        )

    val_eer, _threshold = EER(torch.tensor(positive_scores), torch.tensor(negative_scores))

    return val_eer
    

In [183]:
# Pass the complete list of anonymised files: anonimize() returns a flat list
# of paths, so anon_files[0] is a single path string, which compute_eer would
# iterate character by character.
compute_eer(
    [a['path'] for a in ds["audio"]],
    anon_files
)

0.0

In [None]:
def compute_wer(orig_texts, anon_paths):
    """Word error rate of the anonymised audio against the reference texts.

    The anonymised files are read with ``sf.read``, transcribed in one batch
    with ``processor``/``model``, and the lower-cased transcriptions are
    compared with the lower-cased reference texts using ``WordErrorRate``.

    Args:
        orig_texts: Reference transcriptions, one per anonymised file and in
            the same order. A single ``str`` is treated as one text.
        anon_paths: Paths of the anonymised audio files. A single ``str`` is
            treated as one path rather than iterated character by character.

    Returns:
        The value of ``metric.compute().item()`` after all pairs were added.

    Raises:
        ValueError: If no file is given, if the number of texts and files
            differs (``zip`` would otherwise silently score only a subset),
            or if the files do not share one sampling rate.
    """
    if isinstance(orig_texts, str):
        orig_texts = [orig_texts]
    if isinstance(anon_paths, str):
        anon_paths = [anon_paths]
    orig_texts = list(orig_texts)
    anon_paths = list(anon_paths)

    if not anon_paths:
        raise ValueError('compute_wer needs at least one anonymised file')
    if len(orig_texts) != len(anon_paths):
        raise ValueError(
            f'Got {len(orig_texts)} reference texts for {len(anon_paths)} anonymised files'
        )

    # Load the anonymised audio.
    anon_arrs = []
    anon_rates = []
    for p in anon_paths:
        arr, rate = sf.read(p)
        anon_arrs.append(arr)
        anon_rates.append(rate)
    # The processor receives a single rate for the whole batch.
    if len(set(anon_rates)) != 1:
        raise ValueError(
            f'All anonymised files must share one sampling rate, got {sorted(set(anon_rates))}'
        )

    # Speech -> text for the whole batch.
    inputs = processor(anon_arrs, sampling_rate=anon_rates[0], return_tensors="pt", padding=True)
    generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
    anon_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Case-insensitive comparison; hypotheses first, references second.
    metric = WordErrorRate()
    metric.update(
        [text.lower() for text in anon_texts],
        [text.lower() for text in orig_texts],
    )

    return metric.compute().item()
    

In [182]:
# Pass the complete list of anonymised files: anonimize() returns a flat list
# of paths, so anon_files[0] is a single path string, not the list of files
# that pairs up with ds['text'].
compute_wer(ds['text'], anon_files)

0.08173912763595581