# **Install and import necessary libraries**

In [None]:
!pip install --q git+https://github.com/m-bain/whisperx.git
import whisperx
import gc
import pprint

# Pretty print function for inspecting data
pp = pprint.PrettyPrinter(indent=2)

  Preparing metadata (setup.py) ... [?25l[?25hdone


  torchaudio.set_audio_backend("soundfile")


# **Set device and batch size**

In [None]:
import torch

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on runtimes without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 4  # Reduce if low on GPU memory
# float16 is meant for GPU execution; on the CPU use int8 instead.
# On the GPU, change to "int8" if low on GPU memory (may reduce accuracy).
compute_type = "float16" if device == "cuda" else "int8"


# **Load Audio**

In [52]:
# Path of the recording to process.
# NOTE(review): hard-coded absolute path (looks like a Colab location) — point
# this at your own file before running.
audio_file = "/content/multi.wav"
# Load the recording once; the same `audio` object is reused by the
# transcription, alignment and diarization cells below.
audio = whisperx.load_audio(audio_file)


# **Load Model**

In [53]:
# Load the "large-v2" model on `device` using the configured `compute_type`.
# No language is passed here, so (as the log output of this cell notes) the
# language is detected per audio file, which increases inference time.
model = whisperx.load_model("large-v2", device, compute_type=compute_type)


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


# **Transcribe the audio**

In [54]:
# Transcribe the loaded audio using the configured batch size.
# `result["segments"]` holds entries with 'start', 'end' and 'text' (printed
# below); `result["language"]` is read later by the alignment cell.
result = model.transcribe(audio, batch_size=batch_size)
print("Transcription segments (before alignment):")
pp.pprint(result["segments"])


Detected language: en (1.00) in first 30s of audio...
Transcription segments (before alignment):
[ { 'end': 28.507,
    'start': 0.52,
    'text': ' Okay, so you guys are upset because the Collider thing disproved '
            "your theories? It's worse than that. It hasn't found anything in "
            "years, so we don't know if we're right, we don't know if we're "
            "wrong, we don't know where to go next. All I know is it looks "
            'like I tongue-kissed Avatar. Come on, you guys are physicists. '
            "Okay, you're always gonna be physicists. And sure, sometimes the "
            "physics is hard, but isn't that what makes it boring?"},
  { 'end': 58.422,
    'start': 30.265,
    'text': " Hey. We're here. What's going on? Okay, as far as I can see, "
            "science is dead because Leonard killed it. And I don't know who "
            'the Romulans are, but those guys know how to party. So what do '
            "you want us to do? I don't know. Y

# **Align the Whisper output**

In [55]:
# Load the alignment model matching the language recorded in the transcription
# result, then align the transcribed segments against the audio.
# NOTE(review): the second line rebinds `result`, so re-running this cell on its
# own feeds the aligned output back in as input. Re-run the transcription cell
# first, and confirm the aligned result still carries a "language" key.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)


# **Initialize the diarization pipeline**

In [56]:
import os
from getpass import getpass

# Keep the Hugging Face access token out of the notebook: read it from the
# HF_TOKEN environment variable and only prompt (without echoing) when unset.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Build the pyannote speaker-diarization pipeline on the same device as the
# other models.
diarize_model = whisperx.DiarizationPipeline(model_name='pyannote/speaker-diarization-3.1',
                                             use_auth_token=hf_token,
                                             device=device)


# **Perform speaker diarization**

In [61]:
# Run diarization on the loaded audio, passing speaker-count bounds of 2 and 10.
# The printed result is a table with segment / label / speaker / start / end
# columns; its `speaker` column is used below to list the distinct speaker IDs.
diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=10)
print("Diarization segments:")
pp.pprint(diarize_segments)
print("Unique speakers detected:")
print(diarize_segments.speaker.unique())


Diarization segments:
                               segment label     speaker       start  \
0    [ 00:00:00.687 -->  00:00:05.696]     A  SPEAKER_00    0.687606   
1    [ 00:00:05.696 -->  00:00:11.825]     B  SPEAKER_03    5.696095   
2    [ 00:00:12.147 -->  00:00:15.118]     C  SPEAKER_02   12.147708   
3    [ 00:00:17.971 -->  00:00:27.580]     D  SPEAKER_00   17.971138   
4    [ 00:00:28.582 -->  00:00:29.125]     E  SPEAKER_01   28.582343   
..                                 ...   ...         ...         ...   
102  [ 00:04:39.074 -->  00:04:40.144]    CY  SPEAKER_02  279.074703   
103  [ 00:04:42.996 -->  00:04:43.370]    CZ  SPEAKER_00  282.996604   
104  [ 00:04:43.505 -->  00:04:43.539]    DA  SPEAKER_00  283.505942   
105  [ 00:04:45.203 -->  00:04:48.955]    DB  SPEAKER_00  285.203735   
106  [ 00:04:49.006 -->  00:04:49.465]    DC  SPEAKER_00  289.006791   

            end  
0      5.696095  
1     11.825127  
2     15.118846  
3     27.580645  
4     29.125637  
..   

# **Assign speaker IDs to transcription segments**

In [62]:
# Combine the diarization output with the aligned transcription. As the printed
# output shows, segments and their word entries then carry a 'speaker' value
# such as 'SPEAKER_00'.
# NOTE(review): `result` is rebound again here, so this cell expects the aligned
# result from the alignment cell as its input.
result = whisperx.assign_word_speakers(diarize_segments, result)
print("Transcription segments with speaker IDs:")
pp.pprint(result["segments"])


Transcription segments with speaker IDs:
[ { 'end': 5.101,
    'speaker': 'SPEAKER_00',
    'start': 0.8,
    'text': ' Okay, so you guys are upset because the Collider thing disproved '
            'your theories?',
    'words': [ { 'end': 1.1,
                 'score': 0.617,
                 'speaker': 'SPEAKER_00',
                 'start': 0.8,
                 'word': 'Okay,'},
               { 'end': 1.26,
                 'score': 0.314,
                 'speaker': 'SPEAKER_00',
                 'start': 1.16,
                 'word': 'so'},
               { 'end': 1.48,
                 'score': 0.764,
                 'speaker': 'SPEAKER_00',
                 'start': 1.34,
                 'word': 'you'},
               { 'end': 1.66,
                 'score': 0.475,
                 'speaker': 'SPEAKER_00',
                 'start': 1.5,
                 'word': 'guys'},
               { 'end': 1.8,
                 'score': 0.751,
                 'speaker': 'SPEAKER_00',


# **Format and print the transcript as a conversation**

In [63]:
import re


def _speaker_letters(index):
    """Map a zero-based speaker index to letters: 0 -> 'A', 25 -> 'Z', 26 -> 'AA', ...

    A plain `chr(65 + index)` runs past 'Z' into punctuation for index >= 26;
    this spreadsheet-style numbering stays alphabetic for any index.
    """
    letters = ""
    index += 1  # work 1-based so the bijective base-26 arithmetic below is valid
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        letters = chr(65 + remainder) + letters
    return letters


def _speaker_label(speaker_id_str):
    """Turn a diarization ID such as 'SPEAKER_01' into a label such as 'Speaker B'.

    IDs without a numeric part (including the "Unknown" default used when a
    segment has no speaker) yield 'Speaker Unknown', so the segment's text is
    kept in the transcript instead of being dropped.
    """
    match = re.search(r'(\d+)', speaker_id_str)
    if match is None:
        return "Speaker Unknown"
    return f"Speaker {_speaker_letters(int(match.group(1)))}"


# Format the output: one "Speaker X: text" line per transcription segment
conversation = []  # List to hold the formatted conversation
for segment in result["segments"]:
    speaker_label = _speaker_label(segment.get("speaker", "Unknown"))
    text = segment.get("text", "").strip()  # Transcribed text without surrounding whitespace
    conversation.append(f"{speaker_label}: {text}")

# Convert the conversation list to a single newline-separated block of text
paragraph = "\n".join(conversation)

# Print the formatted conversation
print("\nFormatted conversation in paragraph format:")
print(paragraph)



Formatted conversation in paragraph format:
Speaker A: Okay, so you guys are upset because the Collider thing disproved your theories?
Speaker D: It's worse than that.
Speaker D: It hasn't found anything in years, so we don't know if we're right, we don't know if we're wrong, we don't know where to go next.
Speaker C: All I know is it looks like I tongue-kissed Avatar.
Speaker A: Come on, you guys are physicists.
Speaker A: Okay, you're always gonna be physicists.
Speaker A: And sure, sometimes the physics is hard, but isn't that what makes it boring?
Speaker A: Hey.
Speaker E: We're here.
Speaker E: What's going on?
Speaker A: Okay, as far as I can see, science is dead because Leonard killed it.
Speaker A: And I don't know who the Romulans are, but those guys know how to party.
Speaker A: So what do you want us to do?
Speaker A: I don't know.
Speaker A: You're scientists.
Speaker A: Cheer them up.
Speaker E: Cheer them up?
Speaker E: Do you even know what a scientist is?
Speaker D: B