In [None]:
# `files` is Colab's helper module for moving files between the local machine and the runtime.
from google.colab import files

# Opens the browser upload widget; the result is treated as a dict keyed by uploaded file name.
uploaded = files.upload()
# Only the first uploaded file is used; its name is what later cells open as `path`.
path = next(iter(uploaded.keys()))

Saving WhatsApp Audio 2023-07-03 at 2.27.40 AM.ogg to WhatsApp Audio 2023-07-03 at 2.27.40 AM.ogg


In [None]:
# Number of distinct voices in the recording. It is used as the cluster count in the
# speaker-clustering cell, which needs that number up front.
num_speakers = 2 #@param {type:"integer"}

# 'English' selects the English-only checkpoint name (see model_name below);
# pick 'any' when the recording contains other languages.
language = 'English' #@param ['any', 'English']

# Checkpoint size: smaller sizes run faster but are less accurate, 'large' is the slowest.
model_size = 'medium' #@param ['tiny', 'base', 'small', 'medium', 'large']

# English-only checkpoints are named with a '.en' suffix; 'large' never gets the suffix.
model_name = (
    model_size + '.en'
    if (language == 'English' and model_size != 'large')
    else model_size
)


In [None]:
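# !pip install whisper  # not used: the PyPI package named "whisper" is not OpenAI Whisper, which is installed from GitHub in the next cell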

In [None]:
!pip install -q git+https://github.com/openai/whisper.git > /dev/null # installing whisper

!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null # installing pyannote

import whisper
import datetime

import subprocess

import torch
import pyannote.audio
# PretrainedSpeakerEmbedding wraps a pretrained speaker-embedding model; here it loads the
# SpeechBrain ECAPA-TDNN checkpoint ("spkrec-ecapa-voxceleb"), used to turn a voice clip into an embedding.
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# Use the GPU when the runtime has one; otherwise fall back to the CPU instead of
# failing when the model is moved to a CUDA device that does not exist.
embedding_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=embedding_device)

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [None]:
# !pip install pyannote

In [None]:
# A later cell reads the file with the `wave` module, so anything that is not already a
# .wav file is converted with ffmpeg first. The extension check ignores case and requires the dot.
if not path.lower().endswith('.wav'):
  # '-y' lets ffmpeg overwrite an audio.wav left over from a previous run.
  # check_call raises CalledProcessError when ffmpeg fails, so the notebook stops here
  # instead of carrying on with a missing or outdated audio.wav.
  subprocess.check_call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
  path = 'audio.wav'

In [None]:
# Load the Whisper checkpoint used for transcription. `model_name` (not `model_size`) is passed so
# that the English-only '.en' variant selected in the settings cell is the one actually loaded.
model = whisper.load_model(model_name)

100%|██████████████████████████████████████| 1.42G/1.42G [00:05<00:00, 255MiB/s]


In [None]:
# Run the loaded Whisper model on the audio file at `path`.
result = model.transcribe(path)
# Keep the per-segment entries: a sequence in which each item carries at least the
# "start", "end" and "text" keys that later cells read.
segments = result["segments"]

In [None]:
# Work out the clip length from the WAV header. contextlib.closing guarantees the
# reader is closed once the two header values have been fetched.
with contextlib.closing(wave.open(path, 'r')) as f:
  # getnframes(): total number of audio frames; getframerate(): frames per second.
  frames, rate = f.getnframes(), f.getframerate()

# Length of the recording in seconds.
duration = frames / float(rate)

In [None]:
# pyannote's Audio helper; used below to cut individual clips out of the recording.
audio = Audio()

def segment_embedding(segment):
  """Return the speaker embedding for one transcript segment.

  Args:
    segment: mapping with "start" and "end" times, in the same unit as `duration` (seconds).

  Returns:
    Whatever `embedding_model` returns for the cropped clip (assumed to be a single
    192-value embedding, matching the array it is stored into — TODO confirm).

  Relies on the notebook globals `path`, `duration`, `audio` and `embedding_model`.
  """
  # Whisper overshoots the end timestamp in the last segment, so clamp it to the file length.
  clip = Segment(segment["start"], min(duration, segment["end"]))
  waveform, _sample_rate = audio.crop(path, clip)
  # Indexing with None prepends an axis (presumably the batch axis the model expects).
  return embedding_model(waveform[None])

In [None]:
# One row per transcript segment; 192 is the width used for each embedding vector.
embeddings = np.zeros((len(segments), 192))
for row, seg in enumerate(segments):
  # Fill row `row` with the embedding of the matching segment.
  embeddings[row] = segment_embedding(seg)

# Replace NaN entries with 0 so the clustering step only sees finite numbers
# (np.nan_to_num also maps +/-inf to very large finite values).
embeddings = np.nan_to_num(embeddings)

In [None]:
# Group the segment embeddings into `num_speakers` clusters; each cluster stands for one voice.
clustering = AgglomerativeClustering(n_clusters=num_speakers).fit(embeddings)
# labels_[k] is the cluster index assigned to the k-th row of `embeddings`, i.e. the k-th segment.
labels = clustering.labels_
for segment, label in zip(segments, labels):
  # Cluster indices start at 0; add one so speakers are shown as 'SPEAKER 1', 'SPEAKER 2', ...
  segment["speaker"] = 'SPEAKER ' + str(label + 1)

In [None]:
def time(secs):
  """Convert a number of seconds into a `datetime.timedelta`.

  Args:
    secs: duration in seconds (int or float).

  Returns:
    A timedelta whose length is `secs` rounded to the nearest whole second with
    the built-in `round`; `str()` of it looks like '0:00:04'.
  """
  whole_seconds = round(secs)
  return datetime.timedelta(seconds=whole_seconds)

# Write the transcript to transcript.txt. A header line "SPEAKER n h:mm:ss" is emitted for the
# first segment and whenever the speaker differs from the previous segment's speaker.
# The `with` block closes (and flushes) the file even if a write fails part-way through.
with open("transcript.txt", "w") as f:
  for (i, segment) in enumerate(segments):
    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
      f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
    text = segment["text"]
    # Drop the single leading space the segment text is assumed to start with, but only
    # when it is really there, so text without it does not lose its first character.
    f.write((text[1:] if text.startswith(' ') else text) + ' ')

In [None]:
# Show the finished transcript. The `with` block closes the file handle once it has been read,
# instead of leaving an open handle behind.
with open('transcript.txt', 'r') as transcript_file:
  print(transcript_file.read())


SPEAKER 1 0:00:00
Hello, how are you? I'm good, how are you? 
SPEAKER 2 0:00:04
I'm good, thank you. 
