# Pyannote-Whisper model for AMS

In [1]:
# Install Whisper and pyannote.audio straight from their GitHub repositories.
# The recorded output below shows Whisper resolving to commit
# ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab (openai-whisper==20231117).
# NOTE(review): neither install is pinned, so a later run may pull different
# code than the run captured here — consider pinning to a commit or release.
! pip install git+https://github.com/openai/whisper.git
! pip install -q git+https://github.com/pyannote/pyannote-audio

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-xyb_zvx7
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-xyb_zvx7
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20231117)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper==20231117)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->openai-whisper==20231117)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-n

In [2]:
import os, sys
from google.colab import drive
# Mount Google Drive at /content/drive; the audio file and the CSV files used
# by later cells are referenced under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [34]:
# Location of the audio file to process; later cells read it through `path`.
path = '/content/drive/MyDrive/sampleCall1.wav'  # upload the sample to Google Drive first
#path = os.getcwd() # to get current path
print(path)

/content/drive/MyDrive/sampleCall1.wav


In [35]:
# Import necessary libraries
import whisper
from pyannote.audio import Pipeline
from pyannote.core import Segment, Annotation, Timeline

# Define helper functions
def get_text_with_timestamp(transcribe_res):
    """Pair every transcribed segment's text with its time span.

    Args:
        transcribe_res: Mapping with a ``'segments'`` entry; each item must
            provide ``'start'``, ``'end'`` and ``'text'``.

    Returns:
        A list of ``(Segment(start, end), text)`` tuples, in input order.
    """
    return [
        (Segment(chunk['start'], chunk['end']), chunk['text'])
        for chunk in transcribe_res['segments']
    ]

def add_speaker_info_to_text(timestamp_texts, ann):
    """Attach a speaker label to every timestamped text.

    Args:
        timestamp_texts: Iterable of ``(segment, text)`` pairs.
        ann: Diarization result; must support ``ann.crop(segment).argmax()``.
            Presumably ``argmax`` picks the dominant speaker inside the
            segment — confirm against the pyannote.core documentation.

    Returns:
        A list of ``(segment, speaker, text)`` tuples, in input order.
    """
    return [
        (span, ann.crop(span).argmax(), words)
        for span, words in timestamp_texts
    ]

def merge_cache(text_cache):
    """Collapse a run of ``(segment, speaker, text)`` entries into one entry.

    Args:
        text_cache: Non-empty list of ``(segment, speaker, text)`` tuples.

    Returns:
        ``(Segment(start, end), speaker, sentence)`` where the span runs from
        the first entry's start to the last entry's end, the speaker is taken
        from the first entry, and the sentence is the texts joined with no
        separator.

    Raises:
        IndexError: If ``text_cache`` is empty.
    """
    sentence = ''.join(entry[-1] for entry in text_cache)
    head, tail = text_cache[0], text_cache[-1]
    return Segment(head[0].start, tail[0].end), head[1], sentence

# Characters treated as sentence terminators: merge_sentence closes the
# pending text when a segment's last character is one of these.
PUNC_SENT_END = ['.', '?', '!']

def merge_sentence(spk_text):
    """Merge consecutive segments into per-speaker sentences.

    Segments are accumulated until either the speaker changes or a segment's
    text ends with one of ``PUNC_SENT_END``; the accumulated run is then
    collapsed with ``merge_cache``.

    Args:
        spk_text: Iterable of ``(segment, speaker, text)`` tuples in
            chronological order.

    Returns:
        A list of merged ``(segment, speaker, sentence)`` tuples as produced
        by ``merge_cache``.
    """
    merged_spk_text = []
    pre_spk = None
    text_cache = []
    for seg, spk, text in spk_text:
        # Speaker change: close whatever the previous speaker left pending so
        # that text from two speakers never ends up in one entry.
        if text_cache and pre_spk is not None and spk != pre_spk:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        text_cache.append((seg, spk, text))

        # The sentence-end check runs for every segment, including the first
        # one after a speaker change. Previously that segment skipped the
        # check, so a complete sentence was glued to the speaker's next one.
        if text and text[-1] in PUNC_SENT_END:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        pre_spk = spk

    # Flush trailing text that never reached a sentence terminator.
    if text_cache:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text

def diarize_text(transcribe_res, diarization_result):
    """Combine a transcription result with a diarization result.

    Runs the three helper stages in order: extract timestamped texts, label
    each with a speaker, then merge the labelled segments into sentences.

    Args:
        transcribe_res: Transcription result passed to
            ``get_text_with_timestamp``.
        diarization_result: Diarization result passed to
            ``add_speaker_info_to_text``.

    Returns:
        Whatever ``merge_sentence`` returns for the labelled segments.
    """
    return merge_sentence(
        add_speaker_info_to_text(
            get_text_with_timestamp(transcribe_res),
            diarization_result,
        )
    )

In [36]:
import whisper
from pyannote.audio import Pipeline
from pyannote.core import Segment
import csv

# Takes 6 minutes to process an audio file
# Define helper functions
def get_text_with_timestamp(transcribe_res):
    """Pair every transcribed segment's text with its time span.

    Args:
        transcribe_res: Mapping with a ``'segments'`` entry; each item must
            provide ``'start'``, ``'end'`` and ``'text'``.

    Returns:
        A list of ``(Segment(start, end), text)`` tuples, in input order.
    """
    return [
        (Segment(chunk['start'], chunk['end']), chunk['text'])
        for chunk in transcribe_res['segments']
    ]

def add_speaker_info_to_text(timestamp_texts, ann):
    """Attach a speaker label to every timestamped text.

    Args:
        timestamp_texts: Iterable of ``(segment, text)`` pairs.
        ann: Diarization result; must support ``ann.crop(segment).argmax()``.
            Presumably ``argmax`` picks the dominant speaker inside the
            segment — confirm against the pyannote.core documentation.

    Returns:
        A list of ``(segment, speaker, text)`` tuples, in input order.
    """
    return [
        (span, ann.crop(span).argmax(), words)
        for span, words in timestamp_texts
    ]

def merge_cache(text_cache):
    """Collapse a run of ``(segment, speaker, text)`` entries into one entry.

    Args:
        text_cache: Non-empty list of ``(segment, speaker, text)`` tuples.

    Returns:
        ``(Segment(start, end), speaker, sentence)`` where the span runs from
        the first entry's start to the last entry's end, the speaker is taken
        from the first entry, and the sentence is the texts joined with no
        separator.

    Raises:
        IndexError: If ``text_cache`` is empty.
    """
    sentence = ''.join(entry[-1] for entry in text_cache)
    head, tail = text_cache[0], text_cache[-1]
    return Segment(head[0].start, tail[0].end), head[1], sentence

# Characters treated as sentence terminators: merge_sentence closes the
# pending text when a segment's last character is one of these.
PUNC_SENT_END = ['.', '?', '!']

def merge_sentence(spk_text):
    """Merge consecutive segments into per-speaker sentences.

    Segments are accumulated until either the speaker changes or a segment's
    text ends with one of ``PUNC_SENT_END``; the accumulated run is then
    collapsed with ``merge_cache``.

    Args:
        spk_text: Iterable of ``(segment, speaker, text)`` tuples in
            chronological order.

    Returns:
        A list of merged ``(segment, speaker, sentence)`` tuples as produced
        by ``merge_cache``.
    """
    merged_spk_text = []
    pre_spk = None
    text_cache = []
    for seg, spk, text in spk_text:
        # Speaker change: close whatever the previous speaker left pending so
        # that text from two speakers never ends up in one entry.
        if text_cache and pre_spk is not None and spk != pre_spk:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        text_cache.append((seg, spk, text))

        # The sentence-end check runs for every segment, including the first
        # one after a speaker change. Previously that segment skipped the
        # check, so a complete sentence was glued to the speaker's next one.
        if text and text[-1] in PUNC_SENT_END:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []

        pre_spk = spk

    # Flush trailing text that never reached a sentence terminator.
    if text_cache:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text

def diarize_text(transcribe_res, diarization_result):
    """Combine a transcription result with a diarization result.

    Runs the three helper stages in order: extract timestamped texts, label
    each with a speaker, then merge the labelled segments into sentences.

    Args:
        transcribe_res: Transcription result passed to
            ``get_text_with_timestamp``.
        diarization_result: Diarization result passed to
            ``add_speaker_info_to_text``.

    Returns:
        Whatever ``merge_sentence`` returns for the labelled segments.
    """
    return merge_sentence(
        add_speaker_info_to_text(
            get_text_with_timestamp(transcribe_res),
            diarization_result,
        )
    )

# Main processing workflow
def process_audio(audio_file, auth_token):
    """Transcribe an audio file and attribute each sentence to a speaker.

    Loads the ``pyannote/speaker-diarization-3.1`` pipeline and the Whisper
    ``tiny.en`` model, runs both on ``audio_file``, and merges the two results
    with ``diarize_text``. Progress and errors are reported with ``print``.

    Args:
        audio_file: Audio file to process; passed unchanged to both
            ``model.transcribe`` and the diarization pipeline.
        auth_token: Hugging Face access token, forwarded to
            ``Pipeline.from_pretrained`` as ``use_auth_token``.

    Returns:
        The value returned by ``diarize_text``, or ``None`` if model
        initialization or processing failed.
    """
    # Initialize pipeline and model
    print("Initializing models...")
    try:
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
                                            use_auth_token=auth_token)
        # from_pretrained can report a download/authorization problem by
        # returning None instead of raising. Stop here rather than loading
        # Whisper and transcribing the whole file only to fail afterwards.
        if pipeline is None:
            print("Error during model initialization: the diarization pipeline could not be loaded. "
                  "Check the access token and that the model's user conditions have been accepted.")
            return None
        model = whisper.load_model("tiny.en")
    except Exception as e:
        print(f"Error during model initialization: {e}")
        return None

    # Perform transcription and speaker diarization
    print("Processing audio for transcription and diarization...")
    try:
        asr_result = model.transcribe(audio_file)
        diarization_result = pipeline(audio_file)
    except Exception as e:
        print(f"Error during processing: {e}")
        return None

    # Merge results
    print("Merging transcription and diarization results...")
    final_result = diarize_text(asr_result, diarization_result)

    if not final_result:
        print("No results obtained after merging.")
    else:
        print("Processing complete.")

    return final_result

# Usage example
import os
from getpass import getpass

audio_file = path # Update with your actual file path
# Security: never store an access token in the notebook. Read it from the
# HF_TOKEN environment variable and fall back to an interactive prompt, so the
# secret is not saved with the file or its outputs.
# NOTE: any token that was ever committed in this cell should be treated as
# leaked and revoked in the Hugging Face account settings.
auth_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
output_csv = "/content/drive/MyDrive/output.csv"  # Specify the CSV file path

# Process the audio and get the results
result = process_audio(audio_file, auth_token)

# Check if the result is valid before attempting to print and save
if result:
    print("Saving the final result to CSV...")
    try:
        # newline='' is required by the csv module; an explicit encoding keeps
        # the file readable regardless of the platform's default locale.
        with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Start Time", "End Time", "Speaker", "Text"])
            for seg, spk, sent in result:
                line = f'{seg.start:.2f} {seg.end:.2f} {spk} {sent}'
                print(line)
                writer.writerow([f'{seg.start:.2f}', f'{seg.end:.2f}', spk, sent])
        print(f"Results successfully saved to {output_csv}")
    except Exception as e:
        print(f"Error saving CSV file: {e}")
else:
    print("No result to print or save.")

Initializing models...
Processing audio for transcription and diarization...
Merging transcription and diarization results...
Processing complete.
Saving the final result to CSV...
0.00 9.00 SPEAKER_00  Thank you for calling Nissan.
9.00 10.00 SPEAKER_00  My name is Lauren.
10.00 11.00 SPEAKER_00  Can I have your name?
11.00 13.00 SPEAKER_01  My name is John Smith.
13.00 14.00 SPEAKER_00  Thank you, John.
14.00 15.00 SPEAKER_00  How can I help you?
15.00 20.00 SPEAKER_01  I was just calling about to see how much it would cost to update the map in my car.
20.00 22.00 SPEAKER_00  I'd be happy to help you with that today.
22.00 24.00 SPEAKER_00  Did you receive a mailer from us?
24.00 25.00 SPEAKER_01  I did.
25.00 26.00 SPEAKER_01  Do you need the customer number?
26.00 27.00 SPEAKER_00  Yes, please.
27.00 28.00 SPEAKER_01  Okay.
28.00 30.00 SPEAKER_01  It's 1-5-2-4-3.
30.00 31.00 SPEAKER_01  Thank you.
31.00 33.00 SPEAKER_00  And the year making model of your vehicle?
33.00 36.00 SPEAKE

In [13]:
# Install the ROUGE implementation imported by the evaluation cell below.
# The recorded output shows rouge_score-0.1.2 being installed.
# NOTE(review): the version is not pinned — consider `rouge_score==0.1.2`.
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=f294e402eed5557d016d84620fb942b0d15e55223b83da2926d0aa638a41bcf5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [41]:
import csv
from rouge_score import rouge_scorer

# Function to read a CSV file and return lists of text and speaker information
def read_csv(file_path):
    """Load the text and speaker columns from a transcript CSV.

    The file is expected to start with a header row, followed by rows laid
    out as ``start time, end time, speaker, text``. Blank rows are skipped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(texts, speakers)`` tuple of two lists with one entry per data
        row, in file order. Both lists are empty when the file contains no
        data rows (including a completely empty file).

    Raises:
        ValueError: If a start or end time cannot be parsed as a float.
        IndexError: If a non-blank row has fewer than four columns.
    """
    texts = []
    speakers = []
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires (matters for quoted fields containing newlines).
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines, e.g. a trailing empty
                # line left behind by a hand-edited transcript.
                continue
            # The timestamps are parsed only as validation, so a malformed
            # row fails here instead of silently skewing the comparison.
            float(row[0])
            float(row[1])
            speaker = row[2]  # Keep speaker as a string
            text = row[3]     # Extract text content
            speakers.append(speaker)
            texts.append(text)
    return texts, speakers

# Function to compare speaker segmentation between machine-generated and human-transcribed data
def compare_speakers(machine_speakers, human_speakers):
    """Print the fraction of positions where both speaker labels agree.

    Labels are compared pairwise by position; the result is printed as
    ``Speaker Segmentation Accuracy`` with four decimals (0 for empty input).

    Args:
        machine_speakers: Speaker labels from the machine transcript.
        human_speakers: Speaker labels from the human transcript.

    Raises:
        ValueError: If the two lists differ in length.
    """
    n_labels = len(machine_speakers)
    if n_labels != len(human_speakers):
        raise ValueError("Length of machine speakers and human speakers lists do not match.")

    n_matches = sum(
        1 for machine, human in zip(machine_speakers, human_speakers) if machine == human
    )

    if n_labels > 0:
        accuracy = n_matches / n_labels
    else:
        accuracy = 0
    print(f"Speaker Segmentation Accuracy: {accuracy:.4f}")

# Function to calculate ROUGE scores and display results
def calculate_rouge(machine_texts, human_texts):
    """Print per-segment and average ROUGE-1/2/L F-measures.

    Texts are paired by position; each machine text is scored against the
    human text at the same index, with the human text as the reference.

    Args:
        machine_texts: Machine-generated segment texts.
        human_texts: Human-transcribed segment texts.

    Returns:
        None. All results are printed. When there is nothing to compare, a
        message is printed and no scores are computed.
    """
    num_comparisons = min(len(machine_texts), len(human_texts))
    if num_comparisons == 0:
        # Guard against dividing by zero when computing the averages below.
        print("No text pairs to compare; skipping ROUGE calculation.")
        return

    if len(machine_texts) != len(human_texts):
        # Pairs are aligned purely by index, so surplus entries on either
        # side are ignored; make that visible instead of truncating silently.
        print(f"Warning: {len(machine_texts)} machine segments vs {len(human_texts)} human segments; "
              f"only the first {num_comparisons} pairs are compared.")

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0

    # zip stops at the shorter list, i.e. after num_comparisons pairs.
    for i, (machine_text, human_text) in enumerate(zip(machine_texts, human_texts), start=1):
        # Calculate ROUGE scores for the current segment
        scores = scorer.score(human_text, machine_text)

        # Print ROUGE scores for the current segment
        print(f"Comparison {i}:")
        print(f"Machine Text: {machine_text}")
        print(f"Human Text: {human_text}")
        print(f"ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
        print(f"ROUGE-2: {scores['rouge2'].fmeasure:.4f}")
        print(f"ROUGE-L: {scores['rougeL'].fmeasure:.4f}")
        print("-" * 40)

        # Accumulate ROUGE scores
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure

    # Calculate and print average ROUGE scores
    avg_rouge1 = total_rouge1 / num_comparisons
    avg_rouge2 = total_rouge2 / num_comparisons
    avg_rougeL = total_rougeL / num_comparisons

    print("Overall Average ROUGE Scores:")
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")

# Example usage
# Load the machine transcript written by the transcription cell and the human
# reference transcript; both are read with the same column layout by read_csv.
machine_texts, machine_speakers = read_csv('/content/drive/MyDrive/output.csv')
human_texts, human_speakers = read_csv('/content/drive/MyDrive/human_transcription_samplecall1.csv')  # change to the human transcription CSV being compared

# Calculate ROUGE scores (texts are paired by position in the two files)
calculate_rouge(machine_texts, human_texts)

# Compare speakers (raises ValueError if the two files yield a different number of speaker entries)
compare_speakers(machine_speakers, human_speakers)

Comparison 1:
Machine Text:  Thank you for calling Nissan.
Human Text:  Thank you for calling Nissan.
ROUGE-1: 1.0000
ROUGE-2: 1.0000
ROUGE-L: 1.0000
----------------------------------------
Comparison 2:
Machine Text:  My name is Lauren.
Human Text:  My name is Lauren.
ROUGE-1: 1.0000
ROUGE-2: 1.0000
ROUGE-L: 1.0000
----------------------------------------
Comparison 3:
Machine Text:  Can I have your name?
Human Text:  Can I have your name?
ROUGE-1: 1.0000
ROUGE-2: 1.0000
ROUGE-L: 1.0000
----------------------------------------
Comparison 4:
Machine Text:  My name is John Smith.
Human Text:  My name is John Smith.
ROUGE-1: 1.0000
ROUGE-2: 1.0000
ROUGE-L: 1.0000
----------------------------------------
Comparison 5:
Machine Text:  Thank you, John.
Human Text:  Thank you, John.
ROUGE-1: 1.0000
ROUGE-2: 1.0000
ROUGE-L: 1.0000
----------------------------------------
Comparison 6:
Machine Text:  How can I help you?
Human Text:  How can I help you?
ROUGE-1: 1.0000
ROUGE-2: 1.0000
ROUGE-L: 