In [34]:
from pyannote.audio import Pipeline
from transformers import pipeline as transcribe_pipeline
import torchaudio
from huggingface_hub import login
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
from pydub import AudioSegment
import librosa
import pandas as pd

from config import HF_TOKEN

In [3]:
login(HF_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/michael/.cache/huggingface/token
Login successful


In [4]:
# Initialize pyannote pipeline for speaker diarization
print("loading pyannote pipeline")
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")

# Initialize Whisper model for ASR (automatic speech recognition)
print("loading whisper model")
whisper_model = transcribe_pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Path to your audio file
audio_file = "09-07-2017 audio_6 APCNV-2016-565.mp3"

# Perform diarization
print("performing diarization")
diarization = diarization_pipeline(audio_file)


loading pyannote pipeline


Downloading config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

Downloading config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

Downloading config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

loading whisper model


Downloading config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

performing diarization


In [29]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# model_id = "openai/whisper-large-v3"
model_id = "openai/whisper-base"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


In [36]:

# Load the audio file
print("loading audio file")
# waveform, sample_rate = torchaudio.load(audio_file)

# audio = AudioSegment.from_mp3(audio_file)
waveform, sample_rate = librosa.load(audio_file, sr=16000)  # Load and resample to 16kHz



# Process each segment and transcribe
transcriptions = []
print("processing segments")

previous_speaker = None
current_speaker_text = ""
speaker_ls = []
speaker_text_ls = []
for segment in diarization.itertracks(yield_label=True):
    start_time = segment[0].start
    end_time = segment[0].end
    speaker_label = segment[2]
    
    # Extract the segment from the waveform
    # start_sample = int(start_time * sample_rate)
    # end_sample = int(end_time * sample_rate)
    # segment_waveform = waveform[:, start_sample:end_sample]
    # segment_waveform = waveform[start_time:end_time]
        # Convert time to sample indices
    start_sample = int(start_time * sample_rate)
    end_sample = int(end_time * sample_rate)
    
    # Ensure indices are within bounds
    start_sample = max(0, start_sample)
    end_sample = min(len(waveform), end_sample)
    
    # Extract the segment from the waveform
    segment_waveform = waveform[start_sample:end_sample]


    transcription = pipe(segment_waveform)
    # transcription = result["text"]

    # Convert the segment to text using Whisper
    # transcription = whisper_model(segment_waveform.squeeze().numpy(), return_timestamps=False)
    current_speaker = speaker_label.split('_')[-1]
    
    # Handle the first speaker
    if previous_speaker is None:
        current_speaker_text = transcription['text']


    elif current_speaker != previous_speaker:
        # Append the previous speaker's text if it exists
        if current_speaker_text:
            speaker_ls.append(int(previous_speaker.split('_')[-1]))
            speaker_text_ls.append(current_speaker_text)
            # transcriptions.append(f"Speaker {previous_speaker.split('_')[-1]}: {current_speaker_text}")
        current_speaker_text = transcription['text']
        print(f"Speaker {speaker_label.split('_')[-1]}: {transcription['text']}")
    else:
        current_speaker_text += " " + transcription['text']

    previous_speaker = current_speaker

    # Handle the last speaker after the loop
if current_speaker_text:
    # transcriptions.append(f"Speaker {current_speaker}: {current_speaker_text}")
    speaker_ls.append(int(current_speaker.split('_')[-1]))
    speaker_text_ls.append(current_speaker_text)

# # Print the transcriptions
# for transcription in transcriptions:
#     print(transcription)

loading audio file
processing segments
Speaker 01:  Hello, everyone. My name is Lucerito Martinez. I'm the project planner assigned to the case. The project consists of the demolition of an existing single family home and the subsequent construction of a three-story eight-unit apartment building with a semi-subterranean garage. It will provide a total of 15 parking spaces. The requested entitlements include a zone change from R1-1 to R3-1 to construct the 8-unit apartment building and also include say 15-foot building line removal created by ordinance 1-29661. The recommendation is to approve the categorical exemption class 32, the zone change from R1-1 to TQR3-1.
Speaker 09:  That's the report. Do we have any questions from the Commission for the Department on the report?
Speaker 04:  No questions.
Speaker 09:  Okay.
Speaker 04:  you
Speaker 09:  Thank you.
Speaker 05:  you


There was an error while processing timestamps, we haven't found a timestamp as last token. Was WhisperTimeStampLogitsProcessor used?


Speaker 00:  you
Speaker 04:  I'm sending this one.
Speaker 00:  you
Speaker 09:  It's a general comment request.
Speaker 05:  If there's a comment card...
Speaker 09:  i don't have i don't have any cards for against the side
Speaker 04:  comment on item six.
Speaker 07:  Yeah.
Speaker 04:  Are we?
Speaker 07:  Yeah, this is where NACA before.
Speaker 09:  I'm just happy.
Speaker 03:  Good.
Speaker 09:  By the way
Speaker 08:  yes i'd like to know what is the name of the puppet well bully bull bully bull but the ball everybody goes by must
Speaker 09:  But I'm going to be back.
Speaker 08:  by their name.
Speaker 09:  Thank you.
Speaker 08:  Mr. Puppet. Well, no, but he's known as Mr. Puppet. Okay. Please go ahead. You got one minute.
Speaker 09:  you
Speaker 06:  So on my comment card, they put down general comments about item six. So on the corner on the top left, is that right?
Speaker 08:  Web.
Speaker 06:  There's an item number that's listed.
Speaker 09:  Thank you.
Speaker 03:  

In [38]:
data = {
    "speaker": speaker_ls,
    "text": speaker_text_ls
}

df = pd.DataFrame(data)
df.to_csv(audio_file.replace('.mp3', '') + ".csv", index=False)
df

Unnamed: 0,speaker,text
0,9,Next item on the agenda is some It's the cas...
1,1,"Hello, everyone. My name is Lucerito Martinez..."
2,9,That's the report. Do we have any questions f...
3,4,No questions.
4,9,Okay. Hearing none and there's no other spea...
...,...,...
60,4,Second.
61,7,"Okay, and we are adjourned and we're out. Ple..."
62,9,We're out. the way the main
63,7,He can come to the end.
