# Speaker Recognition

Please download the archive of the exercise here: https://drive.google.com/file/d/1ILlZEjCPro6PYxfbLNfm6SKSb9tPIYfL/view?usp=sharing

Unzip che archive and copy this notebook inside the extracted folder to run the exercise as it is.

In [1]:
!pip install -q tensorflow==2.0.0
!pip install adversarial-robustness-toolbox[all]
!pip install h5py==2.10.0
!pip install Pillow

# In più
!pip install numpy==1.19.5

[K     |████████████████████████████████| 86.3 MB 33 kB/s 
[K     |████████████████████████████████| 3.8 MB 11.3 MB/s 
[K     |████████████████████████████████| 449 kB 16.9 MB/s 
[K     |████████████████████████████████| 50 kB 6.1 MB/s 
[?25h  Building wheel for gast (setup.py) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Insert your current Google Drive folder
%cd 'gdrive/MyDrive/AI4/speaker_id'

/content/gdrive/MyDrive/AI4/speaker_id


In [4]:
import librosa

import numpy as np
import pandas as pd
import IPython.display as ipd

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
tf.compat.v1.disable_eager_execution()

# Audio model
from model import SpeakerID

# ART
import art
from art import config
from art.estimators.classification import KerasClassifier
from art.attacks.evasion import FastGradientMethod

In [5]:
# Settings
checkpoint_path = 'resnet18_mel_25_10_norm.h5'
csv_path = 'database.csv'
file_path = 'id10001.wav'
file_id = 'id10001'

In [6]:
# Read db file list
database = pd.read_csv(csv_path)

# Compute labeles
labeler = LabelEncoder()
labeler.fit(database['VoxCeleb1 ID'])

# Get class num from id
class_num = labeler.transform([file_id])[0]
print(class_num)
# Get id from class num
print(labeler.inverse_transform([class_num])[0])

0
id10001


In [7]:
# Load the model
input_shape = (None, 1)
model = SpeakerID(input_shape, checkpoint_path, n_classes=1251)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [8]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, 1)]         0         
_________________________________________________________________
lambda (Lambda)              (None, None, 1)           0         
_________________________________________________________________
log_melgram_layer (LogMelgra (None, None, 128)         0         
_________________________________________________________________
z_score_normalization (ZScor (None, None, 128)         0         
_________________________________________________________________
model (Model)                (None, 1251)              11822374  
Total params: 11,822,374
Trainable params: 11,814,436
Non-trainable params: 7,938
_________________________________________________________________


In [9]:
# Read the audio file
x, _ = librosa.load(file_path, sr=16000)
# Create batch
x = np.expand_dims(x, 0)
# Add channels dimension
x = np.expand_dims(x, -1)

In [10]:
def load_and_play(id = 10001):
  if (id<10001+labeler.classes_.size):
    file_path = f'id{id}.wav'
    file_id = f'id{id}'
    # Read the audio file
    x, _ = librosa.load(file_path, sr=16000)
    # Create batch
    x = np.expand_dims(x, 0)
    # Add channels dimension
    x = np.expand_dims(x, -1)
    ipd.Audio(x[0,:,0], rate=16000, autoplay=False)
    return x
  else:
    print("Invalid id")

In [11]:
ipd.Audio(x[0,:,0], rate=16000, autoplay=False)

In [12]:
# Predict
y_pred = model.predict(x)

# log
print("Predicted class: ", labeler.inverse_transform(np.argmax(y_pred,axis=1))[0], " Actual class: ", file_id)
print("Prediction confidence: ", np.max(y_pred[0]))

Predicted class:  id10001  Actual class:  id10001
Prediction confidence:  0.9999651


# ART

In [13]:
classifier = KerasClassifier(clip_values=(-1, 1), model=model, use_logits=False)



Complete the exercise trying to attack the neural network with error-generic and error-specific attacks.

## BIM

### Error generic with BIM

In [32]:
from art.attacks.evasion import BasicIterativeMethod

epsilon = 0.001
epsilon_step = 0.00005
max_iter = 20

attack = BasicIterativeMethod(estimator=classifier, eps=epsilon, eps_step=epsilon_step, max_iter=max_iter, targeted=False)

atk = attack.generate(x)

PGD - Random Initializations:   0%|          | 0/1 [00:00<?, ?it/s]

PGD - Iterations:   0%|          | 0/20 [00:00<?, ?it/s]

In [33]:
atk_pred = model.predict(atk)
perturbation = np.mean(np.abs((atk - x)))
print('Average perturbation: {:4.2f}\n'.format(perturbation))

#Show attacked sample example
ipd.Audio(atk[0,:,0], rate=16000, autoplay=False)

Average perturbation: 0.00



In [34]:
print("Predicted class: ", labeler.inverse_transform(np.argmax(atk_pred,axis=1))[0], " Actual class: ", file_id)
print("Prediction confidence: ", np.max(atk_pred[0]))

Predicted class:  id10714  Actual class:  id10001
Prediction confidence:  1.0


### Error Specific with BIM

In [35]:
from art.attacks.evasion import BasicIterativeMethod

epsilon = 0.001
max_iter =50
epsilon_step = epsilon/max_iter

attack = BasicIterativeMethod(estimator=classifier, eps=epsilon, eps_step=epsilon_step, max_iter=max_iter, targeted=True)

In [36]:
targeted_labels = 1*np.ones(1)
one_hot_targeted_labels = tf.keras.utils.to_categorical(targeted_labels, num_classes = labeler.classes_.size)

In [37]:
atk = attack.generate(x, one_hot_targeted_labels)

atk_pred = model.predict(atk)
perturbation = np.mean(np.abs((atk - x)))
print('Average perturbation: {:4.2f}\n'.format(perturbation))

#Show attacked sample example
ipd.Audio(atk[0,:,0], rate=16000, autoplay=False)

PGD - Random Initializations:   0%|          | 0/1 [00:00<?, ?it/s]

PGD - Iterations:   0%|          | 0/50 [00:00<?, ?it/s]

Average perturbation: 0.00



In [39]:
print("Predicted class: ", labeler.inverse_transform(np.argmax(atk_pred,axis=1))[0], " Actual class: ", file_id)
print("Prediction confidence: ", np.max(atk_pred[0]))

Predicted class:  id10002  Actual class:  id10001
Prediction confidence:  1.0


In [40]:
#Show one original example
ipd.Audio(load_and_play(10002)[0,:,0], rate=16000, autoplay=False)

In [41]:
#Show the corresponding adversarial example
ipd.Audio(atk[0,:,0], rate=16000, autoplay=False)

## Carlini&Vagner

### Error Generic

In [51]:
from art.attacks.evasion import CarliniL2Method

binary_search_steps = 1
confidence = 0.5
max_iter = 20
learning_rate = 0.01
initial_const = 1000

attack = CarliniL2Method(classifier=classifier, binary_search_steps=binary_search_steps, confidence=confidence, max_iter=max_iter, learning_rate=learning_rate, initial_const=initial_const, targeted=False)

atk = attack.generate(x)

C&W L_2:   0%|          | 0/1 [00:00<?, ?it/s]

In [52]:
atk_pred = model.predict(atk)
perturbation = np.mean(np.abs((atk - x)))
print('Average perturbation: {:4.2f}\n'.format(perturbation))

#Show attacked sample example
ipd.Audio(atk[0,:,0], rate=16000, autoplay=False)

Average perturbation: 0.00



In [53]:
print("Predicted class: ", labeler.inverse_transform(np.argmax(atk_pred,axis=1))[0], " Actual class: ", file_id)
print("Prediction confidence: ", np.max(atk_pred[0]))

Predicted class:  id10714  Actual class:  id10001
Prediction confidence:  0.99501145


### Error Specific

In [14]:
from art.attacks.evasion import CarliniL2Method

binary_search_steps = 1
confidence = 0.6
max_iter = 50
learning_rate = 0.1
initial_const = 23

attack = CarliniL2Method(classifier=classifier, binary_search_steps=binary_search_steps, confidence=confidence, max_iter=max_iter, learning_rate=learning_rate, initial_const=initial_const, targeted=True)

In [15]:
targeted_labels = 1*np.ones(1)
one_hot_targeted_labels = tf.keras.utils.to_categorical(targeted_labels, num_classes = labeler.classes_.size)

In [16]:
atk = attack.generate(x, one_hot_targeted_labels)

atk_pred = model.predict(atk)
perturbation = np.mean(np.abs((atk - x)))
print('Average perturbation: {:4.2f}\n'.format(perturbation))

#Show attacked sample example
ipd.Audio(atk[0,:,0], rate=16000, autoplay=False)

C&W L_2:   0%|          | 0/1 [00:00<?, ?it/s]

Average perturbation: 0.00



In [20]:
print("Predicted class: ", labeler.inverse_transform(np.argmax(atk_pred,axis=1))[0], " Actual class: ", file_id)
print("Prediction confidence: ", np.max(atk_pred[0]))

Predicted class:  id10001  Actual class:  id10001
Prediction confidence:  0.9999651


In [21]:
#Show one original example
ipd.Audio(load_and_play(10002)[0,:,0], rate=16000, autoplay=False)

In [19]:
#Show the corresponding adversarial example
ipd.Audio(atk[0,:,0], rate=16000, autoplay=False)