# Audio-visual embedding-based face recognition

In [1]:
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image
import torch
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt

from pyannote.audio import Model
from pyannote.audio import Inference
from scipy.spatial.distance import cdist
import os

import numpy as np
import random


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Root folder of the dataset; every sub-directory is treated as one person.
directory = 'VIDTIMIT'  # Replace with the path to your directory

labels = [name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name))]

# Face pipeline: `mtcnn` is applied to a PIL image to obtain the face crop and
# `resnet` (pretrained on vggface2, put in eval mode) turns the crop into an embedding.
mtcnn = MTCNN()
resnet = InceptionResnetV1(pretrained='vggface2').eval()

# Speaker-embedding model. The Hugging Face access token is read from the
# HF_TOKEN environment variable so that no credential is stored in the notebook.
# If the variable is unset, None is passed and the library falls back to its
# own default behaviour.
# NOTE(review): a token was previously committed in this cell - revoke it.
model = Model.from_pretrained("pyannote/embedding", use_auth_token=os.environ.get("HF_TOKEN"))
# window="whole" - presumably one embedding per audio file; the cells below
# reshape the result to (1, 512).
inference = Inference(model, window="whole")
print("-"*50)
print("-"*50)
print("Total Persons : ", len(labels))
print("-"*50)
print("-"*50)


Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.0.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../.cache/torch/pyannote/models--pyannote--embedding/snapshots/20b2db779562a3141f5eadd34a0232dbcd56d620/pytorch_model.bin`
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.0.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../.cache/torch/pyannote/models--pyannote--embedding/snapshots/20b2db779562a3141f5eadd34a0232dbcd56d620/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.0.1+cu117. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.0.1+cu117. Bad things might happen unless you revert torch to 1.x.
--------------------------------------------------
--------------------------------------------------
Total Persons :  43
--------------------------------------------------
--------------------------------------------------


# Store Audio and Video Embeddings

In [7]:
# Enrollment: build one reference face embedding and one reference audio
# embedding per person. The later evaluation cells look up both dictionaries
# for every entry of `labels`.
directory = 'VIDTIMIT'
labels = [name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name))]

embeddings_audio = {}  # label -> audio embedding reshaped to (1, 512)
embeddings_face = {}   # label -> face embedding returned by `resnet`


for label in labels:
    face_path = f"{directory}/{label}/video/head3/004"
    img = Image.open(face_path)
    img_cropped = mtcnn(img)

    # Explicit check instead of a broad `except Exception` around the network
    # call: a missing crop (MTCNN is documented to return None when it detects
    # no face) is reported with the offending image, and any other failure now
    # surfaces with its real traceback.
    if img_cropped is None:
        print(img_cropped, label, img)
        plt.imshow(img)
        break

    # Inference only: do not record an autograd graph for stored embeddings.
    with torch.no_grad():
        img_embedding = resnet(img_cropped.unsqueeze(0))

    embeddings_face[label] = img_embedding

    try:
        audio_path = f"{directory}/{label}/audio/sa1.wav"
        embedding = inference(audio_path).reshape(1,512)
        embeddings_audio[label] = embedding
    except Exception as exc:
        # Best effort: keep enrolling the remaining persons, but show the cause.
        print(label, " Error in audio embedding extraction", exc)

# Make an incomplete enrollment visible here rather than as a KeyError in a
# later cell.
missing_enrollment = [
    label for label in labels
    if label not in embeddings_face or label not in embeddings_audio
]
if missing_enrollment:
    print("Labels without a complete enrollment : ", missing_enrollment)
        

# Fusing Face and Audio embeddings

# Frontal pose : 

Initial accuracy
1. Face recognition accuracy: 100%
2. Audio recognition accuracy: 100%

In [20]:
# Identification with fused scores on the head2/001 frame: for every person,
# embed one probe utterance (sa2.wav) and one probe frame, compare both against
# every enrolled embedding, fuse the two similarity vectors with equal weights
# and count how often the best-scoring identity is the true one.
correct_detection_count = 0
for label in labels:

    # extract embedding
    audio_emd = inference(f"{directory}/{label}/audio/sa2.wav").reshape(1,512)
    img_path = f"{directory}/{label}/video/head2/001"
    img = Image.open(img_path)
    img_cropped = mtcnn(img)
    # Inference only: no autograd graph is needed for the probe embedding.
    with torch.no_grad():
        img_embedding = resnet(img_cropped.unsqueeze(0))

    audio_sim = []
    face_sim = []
    for test_label in labels:
        # Audio similarity (cosine similarity = 1 - cosine distance)
        distance = cdist(audio_emd, embeddings_audio[test_label], metric="cosine")
        audio_sim.append(1 - distance[0, 0])
        # Face similarity
        cos_similarity = F.cosine_similarity(img_embedding, embeddings_face[test_label])
        face_sim.append(float(cos_similarity))

    # Divide each modality by its own maximum before fusing so that both are
    # on a comparable scale.
    audio_sim = np.array(audio_sim)
    audio_sim = audio_sim / audio_sim.max()
    face_sim = np.array(face_sim)
    face_sim = face_sim / face_sim.max()

    similarity = face_sim*0.5 + audio_sim*0.5
    match_label = labels[np.argmax(similarity)]

    if match_label == label:
        correct_detection_count += 1

# Use the number of evaluated persons instead of a hardcoded 43, so the figure
# stays correct if the dataset folder changes.
accuracy = correct_detection_count / len(labels) if labels else 0.0
print("-"*50)
print("-"*50)
print("Accuracy is :" , accuracy*100 , "%" )
print("-"*50)
print("-"*50)



--------------------------------------------------
--------------------------------------------------
Accuracy is : 100.0 %
--------------------------------------------------
--------------------------------------------------


# Side pose : 

Initial accuracy
1. Face recognition accuracy: 95.6%
2. Audio recognition accuracy: 100%

In [21]:
# Identification with fused scores on the head2/025 frame: for every person,
# embed one probe utterance (sa2.wav) and one probe frame, compare both against
# every enrolled embedding, fuse the two similarity vectors with equal weights
# and count how often the best-scoring identity is the true one.
correct_detection_count = 0
for label in labels:

    # extract embedding
    audio_emd = inference(f"{directory}/{label}/audio/sa2.wav").reshape(1,512)
    img_path = f"{directory}/{label}/video/head2/025"
    img = Image.open(img_path)
    img_cropped = mtcnn(img)
    # Inference only: no autograd graph is needed for the probe embedding.
    with torch.no_grad():
        img_embedding = resnet(img_cropped.unsqueeze(0))

    audio_sim = []
    face_sim = []
    for test_label in labels:
        # Audio similarity (cosine similarity = 1 - cosine distance)
        distance = cdist(audio_emd, embeddings_audio[test_label], metric="cosine")
        audio_sim.append(1 - distance[0, 0])
        # Face similarity
        cos_similarity = F.cosine_similarity(img_embedding, embeddings_face[test_label])
        face_sim.append(float(cos_similarity))

    # Divide each modality by its own maximum before fusing so that both are
    # on a comparable scale.
    audio_sim = np.array(audio_sim)
    audio_sim = audio_sim / audio_sim.max()
    face_sim = np.array(face_sim)
    face_sim = face_sim / face_sim.max()

    similarity = face_sim*0.5 + audio_sim*0.5
    match_label = labels[np.argmax(similarity)]

    if match_label == label:
        correct_detection_count += 1

# Use the number of evaluated persons instead of a hardcoded 43, so the figure
# stays correct if the dataset folder changes.
accuracy = correct_detection_count / len(labels) if labels else 0.0
print("-"*50)
print("-"*50)
print("Accuracy is :" , accuracy*100 , "%" )
print("-"*50)
print("-"*50)


--------------------------------------------------
--------------------------------------------------
Accuracy is : 100.0 %
--------------------------------------------------
--------------------------------------------------


# Side Pose with Noisy Audio

In [32]:
# Identification with fused scores on the head2/025 frame, using a randomly
# chosen utterance of each person as the audio probe.
# NOTE(review): this cell does not add noise to the audio; it only picks a
# random file from the person's audio folder. The draw can also return
# sa1.wav, the utterance used for enrollment - confirm that this is intended.
correct_detection_count = 0
directory = "VIDTIMIT"
# Local, seeded generator so the selected files (and therefore the reported
# accuracy) are the same on every run.
rng = random.Random(0)
for label in labels:

    # extract embedding
    audio_dir = f"{directory}/{label}/audio/"
    # Sorted because os.listdir returns entries in arbitrary order, which
    # would otherwise make the seeded choice platform-dependent.
    file_list = sorted(
        file for file in os.listdir(audio_dir)
        if os.path.isfile(os.path.join(audio_dir, file))
    )
    audio_wav = os.path.join(audio_dir, rng.choice(file_list))
    audio_emd = inference(audio_wav).reshape(1,512)

    img_path = f"{directory}/{label}/video/head2/025"
    img = Image.open(img_path)
    img_cropped = mtcnn(img)
    # Inference only: no autograd graph is needed for the probe embedding.
    with torch.no_grad():
        img_embedding = resnet(img_cropped.unsqueeze(0))

    audio_sim = []
    face_sim = []
    for test_label in labels:
        # Audio similarity (cosine similarity = 1 - cosine distance)
        distance = cdist(audio_emd, embeddings_audio[test_label], metric="cosine")
        audio_sim.append(1 - distance[0, 0])
        # Face similarity
        cos_similarity = F.cosine_similarity(img_embedding, embeddings_face[test_label])
        face_sim.append(float(cos_similarity))

    # Divide each modality by its own maximum before fusing so that both are
    # on a comparable scale.
    audio_sim = np.array(audio_sim)
    audio_sim = audio_sim / audio_sim.max()
    face_sim = np.array(face_sim)
    face_sim = face_sim / face_sim.max()

    similarity = face_sim*0.5 + audio_sim*0.5
    match_label = labels[np.argmax(similarity)]

    if match_label == label:
        correct_detection_count += 1

# Use the number of evaluated persons instead of a hardcoded 43, so the figure
# stays correct if the dataset folder changes.
accuracy = correct_detection_count / len(labels) if labels else 0.0
print("-"*50)
print("-"*50)
print("Accuracy is :" , accuracy*100 , "%" )
print("-"*50)
print("-"*50)


--------------------------------------------------
--------------------------------------------------
Accuracy is : 100.0 %
--------------------------------------------------
--------------------------------------------------


# Observations

1. As can be seen, even with noisy audio and a side-pose face, accuracy is 100% when the audio and video embeddings are used together.