# CallAI - Diarization Model (Version III)

***

In [1]:
# Widen the notebook container so wide outputs are easier to read.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` is deprecated and triggers a warning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
# Libraries to install: librosa, moviepy, resemblyzer, ffmpeg (use 'brew install ffmpeg'), SpeechRecognition, soundfile, sounddevice

In [5]:
# Run the line below if you haven't installed the libraries listed above
# !pip install -r lib/requirements.txt

***

# Imports & Installations

In [8]:
import os
import pickle
import warnings
import librosa
import numpy as np

from sklearn.mixture import *
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, normalize

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf

from resemblyzer import preprocess_wav, VoiceEncoder

import sys
sys.path.append(os.path.join(os.getcwd(), 'lib'))
from demo_utils import *

import shutil  # needed only here, to look ffmpeg up on PATH

# Tell imageio/moviepy which ffmpeg binary to use without hard-coding one machine's layout:
#   1. keep IMAGEIO_FFMPEG_EXE if it is already set in the environment,
#   2. otherwise use whatever `ffmpeg` resolves to on PATH (same answer as `which ffmpeg`),
#   3. otherwise fall back to the Homebrew location used previously.
os.environ.setdefault(
    "IMAGEIO_FFMPEG_EXE",
    shutil.which("ffmpeg") or "/opt/homebrew/bin/ffmpeg",
)

from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

import speech_recognition as sr


In [9]:
!which ffmpeg

/opt/homebrew/bin/ffmpeg


In [10]:
# Project root = the directory the notebook is started from; every data path
# below is built relative to it, so no machine-specific location is needed.
path = os.getcwd()
print(path)

/Users/quxch/Documents/DEAKIN/2024T2/SIT764/AMS-Project/diarization_model


***

# Activity 1 - Build Model

In [11]:
# Base name of the recording to diarize; the MP3 source and the WAV copy share it.
audio_file_name = 'sampleCall1'

# Both files live in the `data` folder under the project root.
data_dir = path + '/data'
audio_file_path = f'{data_dir}/{audio_file_name}.mp3'
wav_fpath = f'{data_dir}/{audio_file_name}.wav'

# Echo the two resolved locations, one per line.
print(audio_file_path, wav_fpath, sep='\n')


/Users/quxch/Documents/DEAKIN/2024T2/SIT764/AMS-Project/diarization_model/data/sampleCall1.mp3
/Users/quxch/Documents/DEAKIN/2024T2/SIT764/AMS-Project/diarization_model/data/sampleCall1.wav


In [12]:
## MP3 -> WAV conversion
# librosa decodes the MP3 at a requested rate of 16 kHz and returns the
# waveform (`wav`) together with the sampling rate it used (`sampling_rate`).
wav, sampling_rate = librosa.load(audio_file_path, sr=16000)

# Persist the waveform as a 16-bit PCM WAV file for the later ffmpeg / speech-recognition steps.
sf.write(file=wav_fpath, data=wav, samplerate=sampling_rate, subtype='PCM_16')


In [13]:
# Reference audio: one [start, end] window (in seconds) per speaker, each containing
# that speaker alone. Any part of the recording may be used, but the order of
# `segments` must line up with the order of `speaker_names`.
# Alternative windows tried earlier: [[6, 10], [11, 13]]
segments = [[0, 7], [8, 19]]
speaker_names = ["Agent", "Customer"]

# Convert each window from seconds to sample indices and slice it out of the waveform.
speaker_wavs = [
    wav[int(begin_s * sampling_rate):int(end_s * sampling_rate)]
    for begin_s, end_s in segments
]

In [14]:
# Build the Resemblyzer voice encoder on the CPU.
encoder = VoiceEncoder("cpu")

# Embed the whole recording. With return_partials=True the call returns three values;
# the first is not used in this notebook, the second is kept as `cont_embeds` and the
# third as `wav_splits` (slices into `wav`, used later via their .start/.stop).
# NOTE(review): this can take a while on long recordings when running on the CPU.
_utterance_embed, cont_embeds, wav_splits = encoder.embed_utterance(
    wav,
    rate=2,
    return_partials=True,
)

  checkpoint = torch.load(weights_fpath, map_location="cpu")


Loaded the voice encoder model on cpu in 0.01 seconds.


In [15]:
# One embedding per reference clip, in the same order as `speaker_wavs`.
speaker_embeds = list(map(encoder.embed_utterance, speaker_wavs))

In [16]:
# For every speaker, the product of the continuous embeddings with that speaker's
# reference embedding: one similarity score per window, keyed by speaker name.
similarity_dict = dict(
    zip(
        speaker_names,
        (cont_embeds @ reference_embed for reference_embed in speaker_embeds),
    )
)

In [17]:
# interactive_diarization(similarity_dict, wav, wav_splits)

# `x_crop` (a width in seconds) is not assigned in any visible cell of this notebook.
# Unless a star-import happens to provide it, the computation below would raise
# NameError on a fresh kernel. Keep an existing value if there is one, else use 5.
# NOTE(review): 5 is believed to be the default `x_crop` of `interactive_diarization`
# in Resemblyzer's demo_utils — confirm against lib/demo_utils.py.
x_crop = globals().get("x_crop", 5)

# Centre of every partial window, in seconds.
times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
# Windows per second, derived from the spacing of the first two window centres.
rate = 1 / (times[1] - times[0])
# Number of windows spanned by `x_crop` seconds.
crop_range = int(np.round(x_crop * rate))

In [18]:
# Centre of every partial window, in seconds.
times = [(window.start + window.stop) / 2 / sampling_rate for window in wav_splits]

# Speaker names in the order their similarity tracks appear in `similarity_dict`.
candidate_names = list(similarity_dict.keys())

# For each window keep the best-matching speaker (`labels`) and the similarity
# that won (`labels_conf`).
labels = []
labels_conf = []
for idx in range(len(times)):
    window_scores = [track[idx] for track in similarity_dict.values()]
    winner = int(np.argmax(window_scores))
    labels.append(candidate_names[winner])
    labels_conf.append(window_scores[winner])

In [19]:
# Merge consecutive windows with the same speaker into segments.
# Each entry of `labelling` is a tuple: (speaker, confidence, start_time, end_time).
#
# The confidence is the mean similarity over the windows that make up the segment.
# Previously the similarity of window `i` was stored when a segment was closed at a
# speaker change, but window `i` already belongs to the *next* speaker, so the value
# described the wrong segment.
labelling = []
start_time = 0
segment_first = 0  # index of the first window of the segment currently being built

for i, time in enumerate(times):

    # Speaker change: close the segment that ended with window i-1.
    if i > 0 and labels[i] != labels[i - 1]:
        segment_conf = float(np.mean(labels_conf[segment_first:i]))
        labelling.append((labels[i - 1], segment_conf, start_time, time))
        start_time = time
        segment_first = i

    # Last window: close whatever segment is still open.
    if i == len(times) - 1:
        segment_conf = float(np.mean(labels_conf[segment_first:i + 1]))
        labelling.append((labels[i], segment_conf, start_time, time))

In [20]:

# Print a verdict for every partial window: the best-matching speaker, tagged
# "confident" above 0.75 similarity and "uncertain" above 0.65; anything at or
# below 0.65 is reported as no known speaker.
speaker_ids = list(similarity_dict.keys())

for window_idx in range(len(wav_splits)):
    window_scores = [track[window_idx] for track in similarity_dict.values()]
    top = np.argmax(window_scores)
    name, similarity = speaker_ids[top], window_scores[top]

    verdict = (
        "Speaker: %s (confident)" % name if similarity > 0.75
        else "Speaker: %s (uncertain)" % name if similarity > 0.65
        else "Unknown/No speaker"
    )
    print(verdict)

Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Agent (confident)
Speaker: Customer (uncertain)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (uncertain)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (confident)
Speaker: Customer (

In [None]:
# Transcribe every diarized segment: cut it out of the WAV file with ffmpeg, then
# send the clip to the Google Web Speech API through SpeechRecognition.
# Results: `output_labels[k]` is the speaker of the k-th processed segment and
# `output_script[k]` is the raw API response (show_all=True), or None when the
# segment was empty or could not be recognized.
input_video_path = wav_fpath  # despite the name, this is the WAV audio file
output_tmp_path = path + f'/data/tmp/{audio_file_name}_tmp.wav'

# The scratch folder is not part of the data set and ffmpeg will not create it,
# so make sure it exists before the first clip is written.
os.makedirs(os.path.dirname(output_tmp_path), exist_ok=True)

output_script = []
output_labels = []

# A single recognizer is reused for all segments.
r = sr.Recognizer()

for i in range(0, len(labelling)):

    print(i)

    speaker = labelling[i][0]
    t1 = labelling[i][2]
    t2 = labelling[i][3]
    print(t2-t1)

    # A zero-length segment has no audio to cut or transcribe; record it as
    # unrecognized instead of handing an empty clip to ffmpeg.
    if t2 <= t1:
        print(f"Segment {i} is empty; skipping.")
        output_labels.append(speaker)
        output_script.append(None)
        continue

    # Overwrites the same temporary clip on every iteration.
    ffmpeg_extract_subclip(input_video_path, t1, t2, targetname=output_tmp_path)

    # Use the clip as the audio source.
    with sr.AudioFile(output_tmp_path) as source:
        audio_text = r.record(source)

    try:
        # Recognize the speech using Google.
        output = r.recognize_google(audio_text, language='en-US', show_all=True)
    except sr.UnknownValueError:
        # Handle segments that cannot be recognized.
        print(f"Segment {i} could not be recognized.")
        output = None
    except sr.RequestError as e:
        # The API itself is unreachable or refused the request: later segments
        # would fail the same way, so stop here and keep what we have.
        print(f"Could not request results from Google API; {e}")
        break

    output_labels.append(speaker)
    output_script.append(output)

0
7.3
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Segment 0 could not be recognized.
1
111.5
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful


In [None]:
# Show the transcript line by line: "<index> - Speaker <label> : <text>".
# Segments with no recognition result (None or an empty list) get an empty text.
for idx, speaker in enumerate(output_labels):
    result = output_script[idx]
    if result is None or result == []:
        transcript = ''
    else:
        # Take the first alternative of the raw API response.
        transcript = result['alternative'][0]['transcript']
    print('{} - Speaker {} : {}'.format(idx, speaker, transcript))


***