# Adversarial Optimization for Dictionary Attacks on Speaker Verification

In [1]:
import decimal
from keras import backend as K
from keras.models import Model, load_model
from keras.layers.core import Flatten
from keras.layers import Input, Dot
import librosa
import logging
import math
import numpy as np
import operator
import pandas as pd
import pickle
import random
from scipy import spatial
from scipy.signal import lfilter
import soundfile as sf
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [7]:
def save_obj(obj,name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

## -- Acoustic Features Extraction: Spectrograms

Please set the parameters for spectrogram extraction

In [2]:
acoustic_params = {'max_sec':10, 'bucket_step':1, 'frame_step':0.01, 'sample_rate':16000, 'preemphasis_alpha':0.97, 'frame_len':0.025, 'num_fft':512}

In [57]:
def get_fft_spectrum(filename, params):
    signal = load_wav(filename, params['sample_rate'])
    frames = framesig(signal, frame_len=params['frame_len'] * params['sample_rate'], frame_step=params['frame_step']*params['sample_rate'], winfunc=np.hamming)
    fft = abs(np.fft.fft(frames,n=params['num_fft']))
    fft_norm, fft_means, fft_stds = normalize_frames(fft.T)
    return fft_norm, fft_means, fft_stds

def load_wav(filename, sample_rate):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()
    return audio

def framesig(sig, frame_len, frame_step, winfunc=lambda x: np.ones((x,)), stride_trick=True):
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) # LV

    padlen = int((numframes - 1) * frame_step + frame_len)

    zeros = np.zeros((padlen - slen,))
    padsignal = np.concatenate((sig, zeros))
    if stride_trick:
        win = winfunc(frame_len)
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = np.tile(np.arange(0, frame_len), (numframes, 1)) + np.tile(np.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = np.array(indices, dtype=np.int32)
        frames = padsignal[indices]
        win = np.tile(winfunc(frame_len), (numframes, 1))

    return frames * win

def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: np.ones((x,))):
    frame_len = round_half_up(frame_len)
    frame_step = round_half_up(frame_step)
    numframes = np.shape(frames)[0]
    assert np.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'

    indices = np.tile(np.arange(0, frame_len), (numframes, 1)) + np.tile(
        np.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
    indices = np.array(indices, dtype=np.int32)
    padlen = (numframes - 1) * frame_step + frame_len

    if siglen <= 0: siglen = padlen

    rec_signal = np.zeros((padlen,))
    window_correction = np.zeros((padlen,))
    win = winfunc(frame_len)

    for i in range(0, numframes):
        window_correction[indices[i, :]] = window_correction[
                                               indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]

    rec_signal = rec_signal / window_correction
    return rec_signal[0:siglen]

    return frames * win

def normalize_frames(m,epsilon=1e-12):
    frames = []
    means = []
    stds = []
    for v in m:
        means.append(np.mean(v))
        stds.append(np.std(v))
        frames.append((v - np.mean(v)) / max(np.std(v), epsilon))
    return np.array(frames), np.array(means), np.array(stds)

def denormalize_frames(m, means, stds, epsilon=1e-12):
    return np.array([z * max(stds[i],epsilon) + means[i] for i, z in enumerate(m)])

def rolling_window(a, window, step=1):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]

def round_half_up(number):
    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))

## -- Bottleneck Features Extraction: VGGVox-Vectors

In [4]:
bottleneck = load_model('vggvox.h5') # VGGVox Keras Model

## -- Training Dataset

In [74]:
utterance_paths = load_obj('../data/backup/train_vox2_paths_1000_users') # Pickle File: An array including a list of paths that are used for training: 50 utterance per person 
utterance_paths[:2] # Sample data

['../data/voxceleb2/dev/id02331/6dSawXx_NNk/00033.m4a',
 '../data/voxceleb2/dev/id02331/LJdtecWIS3A/00066.m4a']

In [73]:
utterance_labels = load_obj('../data/backup/train_vox2_labels_1000_users') # Pickle File: An array including a list of labels that corresponds to paths in utterance_paths
utterance_labels[:2] # Sample data

[16, 16]

In [71]:
utterance_bottleneck_features = np.load('../data/backup/train_vox2_embs_1000_users.npy') # Numpy File: A 2D matrix including the embedding vectors for paths in utterance_paths
utterance_bottleneck_features[:2] # Sample data

array([[ 7.78697291e-03,  4.98584891e-03,  1.73048340e-02, ...,
         3.47861648e-02, -5.03421434e-05, -3.13258581e-02],
       [ 8.77610594e-03,  2.80432142e-02,  1.07790232e-02, ...,
         4.44943644e-02,  1.52959935e-02, -9.63459723e-03]])

Please set the *user_id_position* index, e.g., path/to/voxceleb2/dev/user_id/video_id/file.wav -> user_id_position = 4 

In [14]:
user_id_position = 4
indexes_male_utterances = []
indexes_female_utterances = []
vox_metadata = pd.read_csv('vox2_meta.csv', header=None, names=['vid', 'vggid', 'gender', 'set']) # CSV File: A CSV containing the metadata of the VoxCeleb2 dataset
for p_index, path in enumerate(utterance_paths):
    print('\rPath', p_index+1, '/', len(utterance_paths), end='')
    if vox_metadata.loc[vox_metadata.vid == path.split('/')[user_id_position], 'gender'].values[0] == 'm':
        indexes_male_utterances.append(p_index)
    else:
        indexes_female_utterances.append(p_index)

Path 50000 / 50000

## -- Adversarial Optimization

Please set the parameters for optimization

In [75]:
batch_size = 16
n_iterations = 100
learning_rate = 10
min_similarity = -0.25
max_similarity = 0.25
min_change = 1e-5
indexes_optimization = indexes_male_utterances

In [76]:
bottleneck_extractor = Model(bottleneck.inputs, Flatten()(bottleneck.output))

in_a = Input(shape=(512, None, 1))
in_b = Input(shape=(512, None, 1))
inputs = [in_a, in_b]

emb_a = bottleneck_extractor(in_a)
emb_b = bottleneck_extractor(in_b)
similarity = Dot(axes=1, normalize=True)([emb_a, emb_b])

siamese = Model(inputs, similarity)

model_input_layer = [siamese.layers[0].input, siamese.layers[1].input]
model_output_layer =  siamese.layers[-1].output

cost_function = model_output_layer[0][0]

gradient_function = K.gradients(cost_function, model_input_layer)[0]

grab_cost_and_gradients_from_model = K.function(model_input_layer, [cost_function, gradient_function])

filter_gradients = lambda c, g, t1, t2: [g[i] for i in range(len(c)) if c[i] >= t1 and c[i] <= t2]

In [77]:
def evaluate_fac(spectrogram, bottleneck_extractor, utterance_paths, utterance_bottleneck_features, threshold=0.53):
    bottleneck_features = bottleneck_extractor.predict(spectrogram.reshape(1, *spectrogram.shape, 1))[0]
    similarities = [1 - spatial.distance.cosine(bottleneck_features, utterance_bottleneck_features[i]) for i in range(len(utterance_paths))]
    fac = np.sum([1 for s in similarities if s > threshold])
    return fac

Please set the path of the starting waveform

In [78]:
starting_path = 'whatever_voice.wav'
starting_spectrogram, starting_waveform_mean, starting_waveform_std = get_fft_spectrum(starting_path, acoustic_params)

for iteration in range(n_iterations):
    
    costs = []
    gradients = []
    for index in random.sample(indexes_optimization, batch_size):
        base_spectrogram, _, _ = get_fft_spectrum(utterance_paths[index], acoustic_params)
        input_pair =  ([np.array([starting_spectrogram.reshape(*starting_spectrogram.shape, 1)]), np.array([base_spectrogram.reshape(*base_spectrogram.shape, 1)])])
        cost, gradient = grab_cost_and_gradients_from_model(input_pair)
        costs.append(np.squeeze(cost))
        gradients.append(np.squeeze(gradient))
        
    filtered_gradients = filter_gradients(costs, gradients, min_similarity, max_similarity)
    
    if len(filtered_gradients) > 0:
        perturbation = np.mean(filtered_gradients, axis=0) * learning_rate
        perturbation = np.clip(perturbation, min_change, None)
        starting_spectrogram += perturbation
        fac = evaluate_fac(starting_spectrogram, bottleneck_extractor, utterance_paths, utterance_bottleneck_features, threshold=0.53)
        print('\rStep ' + str(iteration + 1) + '/' + str(n_iterations), '- FAC', fac, end='')

Step 100/100 - FAC 10754

## -- Invertion to Waveform

In [79]:
def stft_for_reconstruction(x, params):
    frames = framesig(x, frame_len=params['frame_len'] * params['sample_rate'], frame_step=params['frame_step'] * params['sample_rate'], winfunc=np.hamming)
    fft_norm = np.fft.fft(frames, n=params['num_fft'])
    return fft_norm

def istft_for_reconstruction(X, total_lenght,  params):
    frames = np.fft.ifft(X, n=params['num_fft'])
    x = deframesig(frames, total_lenght, frame_len=params['num_fft'], frame_step=params['frame_step']*params['sample_rate'], winfunc=np.hamming)
    return x

def reconstruct_signal_griffin_lim(total_lenght, fft, params, iterations):
    x_reconstruct = np.random.randn(total_lenght)
    n = iterations
    while n > 0:
        n -= 1
        reconstruction_spectrogram = stft_for_reconstruction(x_reconstruct, params)
        reconstruction_angle = np.angle(reconstruction_spectrogram)
        proposal_spectrogram = fft * np.exp(1.0j * reconstruction_angle)
        prev_x = x_reconstruct
        x_reconstruct = istft_for_reconstruction(proposal_spectrogram, total_lenght, params)
        diff = np.sqrt(np.sum((fft - abs(proposal_spectrogram))**2)/fft.size)
        print('\rReconstruction iteration: {}/{} RMSE: {} '.format(iterations - n, iterations, diff), end='')
    return x_reconstruct

Please set the path for the resulting master voice

In [80]:
master_voice_path = 'master_voice.wav'
iterations_griffin_lim = 300
starting_waveform, starting_rate = librosa.load(starting_path, sr=acoustic_params['sample_rate'], mono=True)
denormalized_starting_spectrogram = denormalize_frames(starting_spectrogram, starting_waveform_mean, starting_waveform_std)
x_reconstruct = reconstruct_signal_griffin_lim(len(starting_waveform), denormalized_starting_spectrogram.T, acoustic_params, iterations=iterations_griffin_lim)
sf.write(master_voice_path, x_reconstruct, acoustic_params['sample_rate'])

Reconstruction iteration: 300/300 RMSE: 1.4660382896075883e-17 

In [82]:
starting_voice_spectrogram, starting_voice_mean, starting_voice_std = get_fft_spectrum(starting_path, acoustic_params)
fac = evaluate_fac(starting_voice_spectrogram, bottleneck_extractor, utterance_paths, utterance_bottleneck_features, threshold=0.53)
print('Starting Voice FAC', fac, end='')

Starting Voice FAC 2462

In [81]:
master_voice_spectrogram, master_voice_mean, master_voice_std = get_fft_spectrum(master_voice_path, acoustic_params)
fac = evaluate_fac(master_voice_spectrogram, bottleneck_extractor, utterance_paths, utterance_bottleneck_features, threshold=0.53)
print('Master Voice FAC', fac, end='')

Master Voice FAC 6279