<a href="https://colab.research.google.com/github/pararthdave/NVIDIA-NeMo-Speaker-Diarization/blob/main/nvidia_nemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the audio and embedding paths used later
# in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Installing Nemo toolkit and other requires tools
!apt-get install sox libsndfile1 ffmpeg
## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

# Install TorchAudio
!pip install torchaudio>=0.10.0 -f https://download.pytorch.org/whl/torch_stable.html

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.28-7ubuntu0.1).
ffmpeg is already the newest version (7:4.2.7-0ubuntu0.1).
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base
  libsox3
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base
  libsox3 sox
0 upgraded, 6 newly installed, 0 to remove and 24 not upgraded.
Need to get 513 kB of archives.
After this operation, 1,564 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 libopencore-amrnb0 amd64 0.1.5-1 [94.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 libopencore-amrwb0 amd64 0.1.5-1 [49.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 libsox3 amd64 14.4.2+git20190427-2+deb

In [None]:
# Importing necessary libraries
import glob
import os
import wave
import contextlib
import subprocess
import numpy as np
from numpy.linalg import norm
from nemo.collections.asr.parts.utils.speaker_utils import embedding_normalize
from tqdm import  tqdm
# Use PyTorch's mixed-precision context manager when it can be imported.
try:
    from torch.cuda.amp import autocast
except ImportError:
    from contextlib import contextmanager

    # Fallback: a do-nothing context manager with the same name, so code written
    # as `with autocast():` still runs when the import above is unavailable.
    @contextmanager
    def autocast(enabled=None):
        yield
import json
import pickle as pkl
import torch
import nemo
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
import time


[NeMo W 2023-05-16 18:27:53 optimizers:54] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2023-05-16 18:27:56 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [None]:
def create_manifest_identification(wave_path,output_manifest_path):
  """Write a single-entry JSON-lines manifest describing one WAV file.

  The manifest holds the wave file path, its duration in seconds and a speaker
  label. The label is a dummy value kept only to satisfy the manifest format; it
  is not used for identification. The generated manifest is used afterwards to
  obtain the speaker embeddings.

  Args:
    wave_path: path of the WAV file to describe.
    output_manifest_path: path of the manifest file to (over)write.

  Raises:
    Whatever wave.open()/open() raise (e.g. FileNotFoundError, wave.Error). The
    audio is read before the manifest is opened, so a failure while reading the
    audio does not leave an empty manifest file behind.
  """
  # Read the audio first: duration = number of frames / frames per second.
  with contextlib.closing(wave.open(wave_path,'r')) as wav_file:
    dur = wav_file.getnframes() / float(wav_file.getframerate())

  main_dict = {
      "audio_filepath": wave_path,
      "duration": dur,
      "label": "dummy_id",
  }
  print(main_dict)

  # The context manager closes the manifest even if the write fails.
  with open(output_manifest_path,"w") as output:
    output.write(json.dumps(main_dict))
    output.write('\n')

In [None]:
from torch.nn.modules import linear
def obtain_embeddings(speaker_model, manifest_file,  embedding_dir, batch_size=1, device='cpu'):
    """Compute speaker embeddings for a manifest and pickle them keyed by label.

    Args:
        speaker_model: speaker model object; this function calls its
            setup_test_data(), to(), eval(), test_dataloader() and
            forward(input_signal=..., input_signal_length=...) methods.
        manifest_file: path of a JSON-lines manifest (as generated by
            create_manifest_identification()); each line must carry a 'label' key.
        embedding_dir: directory under which an 'embeddings' sub-directory is
            created to hold the output file.
        batch_size: batch size placed in the test data config.
        device: device the model and each batch are moved to.

    Returns:
        Path of the written pickle file,
        <embedding_dir>/embeddings/<manifest name without extension>_embeddings.pkl.
        It contains a dict mapping each manifest label to one embedding; when a
        label occurs on several manifest lines, the embedding is the arithmetic
        mean over those lines.
    """
    test_config = OmegaConf.create(
        dict(
            manifest_filepath=manifest_file,
            sample_rate=16000,
            labels=None,
            batch_size=batch_size,
            shuffle=False,
            time_length=20,
        )
    )

    speaker_model.setup_test_data(test_config)
    speaker_model = speaker_model.to(device)
    speaker_model.eval()

    all_embs = []
    # Inference only: no_grad() avoids building an autograd graph for every batch.
    with torch.no_grad():
        for test_bat in tqdm(speaker_model.test_dataloader()):
            test_bat = [x.to(device) for x in test_bat]
            aud_signal, aud_signal_len, labels, slices = test_bat
            with autocast():
                _, embs = speaker_model.forward(input_signal=aud_signal, input_signal_length=aud_signal_len)
                # Flatten to (N, emb_dim) so that one row is one embedding.
                emb_shape = embs.shape[-1]
                embs = embs.view(-1, emb_shape)
                all_embs.extend(embs.cpu().detach().numpy())
            del test_bat

    all_embs = np.asarray(all_embs)
    if len(all_embs) > 1:
        all_embs = embedding_normalize(all_embs)

    # Accumulate a per-label sum and count and divide once at the end. Repeatedly
    # averaging "(previous + new) / 2" is only a true mean for up to two entries;
    # beyond that it weights later entries more heavily than earlier ones.
    # NOTE(review): row i of all_embs is assumed to belong to manifest line i, as
    # in the original code -- confirm this holds for the dataloader in use.
    emb_sums = {}
    emb_counts = {}
    with open(manifest_file, 'r') as manifest:
        for i, line in enumerate(manifest):
            dic = json.loads(line.strip())
            uniq_name = dic['label']
            if uniq_name in emb_sums:
                emb_sums[uniq_name] = emb_sums[uniq_name] + all_embs[i]
                emb_counts[uniq_name] += 1
            else:
                emb_sums[uniq_name] = all_embs[i]
                emb_counts[uniq_name] = 1
    out_embeddings = {name: emb_sums[name] / emb_counts[name] for name in emb_sums}

    embedding_dir = os.path.join(embedding_dir, 'embeddings')
    os.makedirs(embedding_dir, exist_ok=True)
    # splitext/basename also cope with manifest names that have no extension,
    # where rsplit('.', 1)[-2] would raise IndexError.
    prefix = os.path.splitext(os.path.basename(manifest_file))[0]
    embeddings_file = os.path.join(embedding_dir, prefix) + '_embeddings.pkl'
    # Close the file deterministically so the pickle is fully flushed before the
    # caller re-opens it.
    with open(embeddings_file, 'wb') as emb_file:
        pkl.dump(out_embeddings, emb_file)
    return embeddings_file

In [None]:
def cosin_sim(A,B):
  """Return the cosine similarity score of the two embeddings passed in.

  Computed as dot(A, B) divided by the product of the norms of A and B.
  """
  dot_product = np.dot(A,B)
  magnitude = norm(A)*norm(B)
  return dot_product/magnitude

In [None]:
def identification(audio_path,speaker_model,enrolled_emb_path,threshould=0.4,
                   output_dir="/content/drive/MyDrive/nvidia-nemo-voxceleb/tmp/"):
  """Overall wrapper for speaker identification of one audio file.

  Creates a manifest for the audio, extracts its embedding with
  obtain_embeddings(), and compares it by cosine similarity with every
  enrolled embedding.

  Args:
    audio_path: path of the WAV file to identify.
    speaker_model: speaker model passed through to obtain_embeddings().
    enrolled_emb_path: path of the pickle holding the already enrolled
      embeddings (a dict mapping speaker name to embedding).
    threshould: minimum score for a match. A speaker is accepted only when its
      score is greater than this value and greater than the best score so far
      (which starts at 0).
    output_dir: working directory for the manifest and the speaker embeddings;
      created (with parents) when missing. Pass the appropriate path for your
      environment.

  Returns:
    The result dict, which is also printed. It always has "status" and
    "Time_taken" (seconds); "Speaker_name" and "score" are present only when a
    speaker was identified.
  """
  start_time = time.time()
  output_dict = dict()

  # makedirs also creates missing parent directories, where os.mkdir would fail.
  os.makedirs(output_dir, exist_ok=True)
  output_manifest_path = os.path.join(output_dir, "identify_manifest.json")
  create_manifest_identification(audio_path,output_manifest_path)
  spk_emb_path = obtain_embeddings(speaker_model, output_manifest_path,output_dir,batch_size=1, device='cpu')

  # SECURITY NOTE: pickle.load can execute arbitrary code; only load embedding
  # files that come from a trusted source.
  with open(spk_emb_path, "rb") as emb_file:
    identify_emb_pkl = pkl.load(emb_file)
  with open(enrolled_emb_path, "rb") as emb_file:
    enrolled_emb_pkl = pkl.load(emb_file)

  higest_score = 0
  speaker_name = None
  for ver_emb in identify_emb_pkl.values():
    for speaker, emb in enrolled_emb_pkl.items():
      score = cosin_sim(emb,ver_emb)
      # Keep the best-scoring enrolled speaker that also clears the threshold.
      if score > higest_score and score > threshould:
        speaker_name = speaker
        higest_score = score

  # speaker_name is only ever set together with a positive higest_score, so
  # testing it replaces the original float comparison "higest_score == 0".
  if speaker_name is None:
    output_dict["status"] = "We could not verify"
  else:
    output_dict["status"] = "Identification done"
    output_dict["Speaker_name"] = speaker_name
    output_dict["score"] = higest_score

  output_dict["Time_taken"] = time.time() - start_time
  print("output: ",output_dict)
  return output_dict

In [None]:
# Set up the NeMo pretrained speaker model that is used to generate the speaker embeddings.
# Note: the enrollment was done using NeMo's pretrained titanet_large speaker model.
spk_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name='titanet_large')

[NeMo I 2023-05-16 18:51:58 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.19.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2023-05-16 18:51:58 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.19.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2023-05-16 18:51:58 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-05-16 18:51:58 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2023-05-16 18:51:58 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2023-05-16 18:51:58 features:291] PADDING: 16
[NeMo I 2023-05-16 18:51:58 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.19.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.


In [None]:
import time
# Audio file whose speaker should be identified.
audio_path = "/content/drive/MyDrive/nvidia-nemo-voxceleb/id10289.wav" # Please pass the appropriate path
# Pickle file holding the already enrolled speaker embeddings.
enrolled_emb_path = "/content/drive/MyDrive/nvidia-nemo-voxceleb/data/embeddings/train_6_speaker_embeddings.pkl" # Please pass the appropriate path
threshold = 0.4 # Threshold value: a cosine-similarity score must exceed it to count as a match
# Call the identification function, passing the arguments defined above.
identification(audio_path,spk_model,enrolled_emb_path,threshold)

{'audio_filepath': '/content/drive/MyDrive/nvidia-nemo-voxceleb/id10289.wav', 'duration': 7.2800625, 'label': 'dummy_id'}
[NeMo I 2023-05-16 18:52:00 collections:298] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-05-16 18:52:00 collections:299] Dataset loaded with 1 items, total duration of  0.00 hours.
[NeMo I 2023-05-16 18:52:00 collections:301] # 1 files loaded accounting to # 1 labels


100%|██████████| 1/1 [00:00<00:00,  1.85it/s]

/content/drive/MyDrive/nvidia-nemo-voxceleb/tmp/identify_manifest.json
identify_manifest
output:  {'status': 'Identification done', 'Speaker_name': 'id10289', 'score': 0.85525507, 'Time_taken': 0.600888729095459}



