# Adversarial Optimization for Dictionary Attacks on Speaker Verification

In [1]:
import decimal 
from keras import backend as K
from keras.models import Model, load_model
from keras.layers.core import Flatten
from keras.layers import Input, Dot
import librosa
import logging
import math
import numpy as np
import operator
import pandas as pd
import pickle
import random
from scipy import spatial
from scipy.signal import lfilter
import soundfile as sf
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


**Please open this Notebook on Prince Cluster**

In [2]:
def save_obj(obj,name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

## -- Acoustic Features Extraction: Spectrograms

Please set the parameters for spectrogram extraction

In [3]:
acoustic_params = {'max_sec': 10, 'bucket_step': 1, 'frame_step': 0.01, 'sample_rate': 16000, 'preemphasis_alpha': 0.97, 'frame_len': 0.025, 'num_fft': 512}

In [4]:
def get_fft_spectrum(filename, params):
    signal = load_wav(filename, params['sample_rate'])
    frames = framesig(signal, frame_len=params['frame_len'] * params['sample_rate'], frame_step=params['frame_step']*params['sample_rate'], winfunc=np.hamming)
    fft = abs(np.fft.fft(frames,n=params['num_fft']))
    fft_norm, fft_means, fft_stds = normalize_frames(fft.T)
    return fft_norm, fft_means, fft_stds

def load_wav(filename, sample_rate):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()
    return audio

def framesig(sig, frame_len, frame_step, winfunc=lambda x: np.ones((x,)), stride_trick=True):
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) # LV

    padlen = int((numframes - 1) * frame_step + frame_len)

    zeros = np.zeros((padlen - slen,))
    padsignal = np.concatenate((sig, zeros))
    if stride_trick:
        win = winfunc(frame_len)
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = np.tile(np.arange(0, frame_len), (numframes, 1)) + np.tile(np.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = np.array(indices, dtype=np.int32)
        frames = padsignal[indices]
        win = np.tile(winfunc(frame_len), (numframes, 1))

    return frames * win

def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: np.ones((x,))):
    frame_len = round_half_up(frame_len)
    frame_step = round_half_up(frame_step)
    numframes = np.shape(frames)[0]
    assert np.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'

    indices = np.tile(np.arange(0, frame_len), (numframes, 1)) + np.tile(
        np.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
    indices = np.array(indices, dtype=np.int32)
    padlen = (numframes - 1) * frame_step + frame_len

    if siglen <= 0: siglen = padlen

    rec_signal = np.zeros((padlen,))
    window_correction = np.zeros((padlen,))
    win = winfunc(frame_len)

    for i in range(0, numframes):
        window_correction[indices[i, :]] = window_correction[
                                               indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]

    rec_signal = rec_signal / window_correction
    return rec_signal[0:siglen]

    return frames * win

def normalize_frames(m,epsilon=1e-12):
    frames = []
    means = []
    stds = []
    for v in m:
        means.append(np.mean(v))
        stds.append(np.std(v))
        frames.append((v - np.mean(v)) / max(np.std(v), epsilon))
    return np.array(frames), np.array(means), np.array(stds)

def denormalize_frames(m, means, stds, epsilon=1e-12):
    return np.array([z * max(stds[i],epsilon) + means[i] for i, z in enumerate(m)])

def rolling_window(a, window, step=1):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]

def round_half_up(number):
    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))

## -- Invertion to Waveform

In [18]:
def stft_for_reconstruction(x, params):
    frames = framesig(x, frame_len=params['frame_len'] * params['sample_rate'], frame_step=params['frame_step'] * params['sample_rate'], winfunc=np.hamming)
    fft_norm = np.fft.fft(frames, n=params['num_fft'])
    return fft_norm

def istft_for_reconstruction(X, total_lenght,  params):
    frames = np.fft.ifft(X, n=params['num_fft'])
    x = deframesig(frames, total_lenght, frame_len=params['num_fft'], frame_step=params['frame_step']*params['sample_rate'], winfunc=np.hamming)
    return x

def reconstruct_signal_griffin_lim(total_lenght, fft, params, iterations):
    x_reconstruct = np.random.randn(total_lenght)
    n = iterations
    while n > 0:
        n -= 1
        reconstruction_spectrogram = stft_for_reconstruction(x_reconstruct, params)
        reconstruction_angle = np.angle(reconstruction_spectrogram)
        proposal_spectrogram = fft * np.exp(1.0j * reconstruction_angle)
        prev_x = x_reconstruct
        x_reconstruct = istft_for_reconstruction(proposal_spectrogram, total_lenght, params)
        diff = np.sqrt(np.sum((fft - abs(proposal_spectrogram))**2)/fft.size)
        print('\rReconstruction iteration: {}/{} RMSE: {} '.format(iterations - n, iterations, diff), end='')
    return x_reconstruct

Please set the path for the resulting master voice

In [19]:
master_voice_path = 'master_voice.wav'
iterations_griffin_lim = 300
starting_waveform, starting_rate = librosa.load(starting_path, sr=acoustic_params['sample_rate'], mono=True)
denormalized_starting_spectrogram = denormalize_frames(starting_spectrogram, starting_waveform_mean, starting_waveform_std)
x_reconstruct = reconstruct_signal_griffin_lim(len(starting_waveform), denormalized_starting_spectrogram.T, acoustic_params, iterations=iterations_griffin_lim)
sf.write(master_voice_path, x_reconstruct, acoustic_params['sample_rate'])

Reconstruction iteration: 300/300 RMSE: 1.2251139773251946e-17 

In [20]:
starting_voice_spectrogram, starting_voice_mean, starting_voice_std = get_fft_spectrum(starting_path, acoustic_params)
fac = evaluate_fac(starting_voice_spectrogram, bottleneck_extractor, utterance_paths, utterance_bottleneck_features, threshold=0.53)
print('Starting Voice FAC', fac, end='')

Starting Voice FAC 2462

In [21]:
master_voice_spectrogram, master_voice_mean, master_voice_std = get_fft_spectrum(master_voice_path, acoustic_params)
fac = evaluate_fac(master_voice_spectrogram, bottleneck_extractor, utterance_paths, utterance_bottleneck_features, threshold=0.53)
print('Master Voice FAC', fac, end='')

Master Voice FAC 6364

In [22]:
print('Please find your Master Voice at', master_voice_path)

Please find your Master Voice at master_voice.wav
