In [1]:
from pyannote.audio import Pipeline
from transformers import pipeline as transcribe_pipeline
import torchaudio
from huggingface_hub import login
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
from pydub import AudioSegment
import librosa
import pandas as pd

from config import HF_TOKEN

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
login(HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/mpham/.cache/huggingface/token
Login successful


In [18]:
# Initialize pyannote pipeline for speaker diarization
print("loading pyannote pipeline")
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")

# Initialize Whisper model for ASR (automatic speech recognition)
print("loading whisper model")
whisper_model = transcribe_pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Path to your audio file
# Convert MP3 to WAV
print("converting to .wav file")
mp3_file = "07_CPC_2022_6189.mp3"
wav_file = mp3_file.replace(".mp3", ".wav")
audio = AudioSegment.from_mp3(mp3_file)
audio.export(wav_file, format="wav")

# Use the WAV file for processing
audio_file = wav_file

# Perform diarization
print("performing diarization")
diarization = diarization_pipeline(audio_file)


loading pyannote pipeline
loading whisper model


Device set to use cpu


converting to .wav file
performing diarization


In [19]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# model_id = "openai/whisper-large-v3"
model_id = "openai/whisper-base"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


Device set to use cpu


In [20]:

# Load the audio file
print("loading audio file")
# waveform, sample_rate = torchaudio.load(audio_file)

# audio = AudioSegment.from_mp3(audio_file)
waveform, sample_rate = librosa.load(audio_file, sr=16000)  # Load and resample to 16kHz



# Process each segment and transcribe
transcriptions = []
print("processing segments")

previous_speaker = None
current_speaker_text = ""
speaker_ls = []
speaker_text_ls = []
for segment in diarization.itertracks(yield_label=True):
    start_time = segment[0].start
    end_time = segment[0].end
    speaker_label = segment[2]
    
    # Extract the segment from the waveform
    # start_sample = int(start_time * sample_rate)
    # end_sample = int(end_time * sample_rate)
    # segment_waveform = waveform[:, start_sample:end_sample]
    # segment_waveform = waveform[start_time:end_time]
        # Convert time to sample indices
    start_sample = int(start_time * sample_rate)
    end_sample = int(end_time * sample_rate)
    
    # Ensure indices are within bounds
    start_sample = max(0, start_sample)
    end_sample = min(len(waveform), end_sample)
    
    # Extract the segment from the waveform
    segment_waveform = waveform[start_sample:end_sample]


    transcription = pipe(segment_waveform)
    # transcription = result["text"]

    # Convert the segment to text using Whisper
    # transcription = whisper_model(segment_waveform.squeeze().numpy(), return_timestamps=False)
    current_speaker = speaker_label.split('_')[-1]
    
    # Handle the first speaker
    if previous_speaker is None:
        current_speaker_text = transcription['text']


    elif current_speaker != previous_speaker:
        # Append the previous speaker's text if it exists
        if current_speaker_text:
            speaker_ls.append(int(previous_speaker.split('_')[-1]))
            speaker_text_ls.append(current_speaker_text)
            # transcriptions.append(f"Speaker {previous_speaker.split('_')[-1]}: {current_speaker_text}")
        current_speaker_text = transcription['text']
        print(f"Speaker {speaker_label.split('_')[-1]}: {transcription['text']}")
    else:
        current_speaker_text += " " + transcription['text']

    previous_speaker = current_speaker

    # Handle the last speaker after the loop
if current_speaker_text:
    # transcriptions.append(f"Speaker {current_speaker}: {current_speaker_text}")
    speaker_ls.append(int(current_speaker.split('_')[-1]))
    speaker_text_ls.append(current_speaker_text)

# # Print the transcriptions
# for transcription in transcriptions:
#     print(transcription)

loading audio file
processing segments
Speaker 39:  Good morning commissioners. I'm Kevin Golden, city planner with the Expedited Processing Section and what we have before you is the Mission Road Project.
Speaker 35:  Morning commissioners, Heather Bliemers, the Department of City Planning. As mentioned, we just wanted to get this in the record before your consideration of the project.
Speaker 39:  Thank you.
Speaker 04:  Thank you so much. We will now hear from the applicant. Please approach the podium. And please state your name for the record and speak directly into the microphone. We have Shay Yadden from Lincoln Park Holdings. And I believe Jesse Harris is here.
Speaker 12:  That's correct.
Speaker 04:  Good morning.
Speaker 12:  Good morning.
Speaker 04:  How many minutes will you need?
Speaker 12:  I'm going to request 20 minutes. Please excuse a exceptionally long presentation, but there's a lot that we feel that we want to address with this project. I'll try to be quicker tha

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Speaker 19: 嗨 哎呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀呀
Speaker 29:  Thank you.
Speaker 19:  you
Speaker 29:  I want to...
Speaker 19:  and
Speaker 29:  Thank you, I'm such a...
Speaker 19:  you
Speaker 29:  you
Speaker 19:  you
Speaker 29:  Actually, I was eligible to decide this.
Speaker 19:  you
Speaker 29:  very small apocalyptic in LA to those popular splatters and species.
Speaker 37:  Thank you, that is your time.
Speaker 09:  Yes, first of all, I have a light on the lens and it has some very active.
Speaker 31:  MBC 뉴스 김
Speaker 09:  Um
Speaker 37:  Thank you. Sergio Garcia, you are not able to amuse. Please proceed.
Speaker 07:  Oh, here you go.
Speaker 37:  Thank you. That is your time.
Speaker 24:  Hello, my name is Lina, I'm a Canadian.
Speaker 19:  I'm sorry, I didn't ask you for it.
Speaker 29:  Thank you.
Speaker 19:  MBC 뉴스 김
Speaker 29:  MBC 뉴스 김
Speaker 37:  Thank you.
Speaker 24:  Yes,

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Speaker 31:  you
Speaker 37:  Thank you. Thank you. That is your time.
Speaker 30:  Thank you.
Speaker 31:  you
Speaker 30:  you
Speaker 37:  Kim, you are not able to amuse, please proceed.
Speaker 31:  MBC 뉴스 김
Speaker 24:  MBC 뉴스 김
Speaker 26:  you
Speaker 24:  MBC 뉴스 김
Speaker 26:  I'm sorry, I'm sorry.
Speaker 00:  you
Speaker 26:  have this level of work that has been very unhappy for us as a scientist for this project, because of new ideas and the core of our resilience that has been through different issues.
Speaker 37:  Thank you.
Speaker 19:  Hi, my name is Cindy Perez. I am a graduate from the State University of New Zealand. I'm a company, some of the third-graders. And I am calling to you, saying that I am it's true of post to this development. I am here to offer you to that institute exchange of Nalisida Extension, and I will be your open-ended online.
Speaker 29:  I'm just going to be doing a best idea as well. I appreciate that you love me, and that the house that you lo

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Speaker 29:  Yes, many of the times we've been using this kind of thing that we've seen in the last minutes, some of the reasons we've been doing this, but I think it's a very, very, I think it's a very, very, very, I would say that the timing is great, so it's a very, very, very, very, very, very, very, very, very, very, very, very, very, very, very, that it's so kind of accurate.
Speaker 25:  you
Speaker 37:  Thank you, that is your time. Dahmah, you are not able to amuse, please proceed.
Speaker 25:  you
Speaker 29:  Thank you.
Speaker 25:  Thank you.
Speaker 29:  I don't know.
Speaker 25:  I'm just trying to find out if I can hold in certain locations. So I did mine, and I did see them in a community.
Speaker 29:  and the work council is an out of request of not to be seen as other folks work communities by enjoying the active and open color, a violent development that I'm not sure if I'm pleased. This project will perpetuate an environment of racism. Please, afford to be an inspir

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Speaker 25:  I think you're saying right now, I'm not sure if that is going to say it. I'm making a kind of talk to the world, all the times, I've been thinking of the universe and say that, saying obviously, along with my neighbors, and so you make sure that I heard from the outside talk to you very many and talk to you guys about that. And I wanted to say, no, but I think I was saying, you're saying, talk to me about the universe, and then we'll see you in the next video.
Speaker 37:  Thank you. Hugo Pacheco, you're not able to unmute. Please proceed.
Speaker 15:  Thank you.
Speaker 37:  Thank you. Community resilient. You're not able to unmute. Please proceed.
Speaker 30:  Hi, thanks.
Speaker 31:  MBC 뉴스 김
Speaker 30:  I'm going to stay like knowledge.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Speaker 37:  Thank you. That was our last caller, Vice President Joe.
Speaker 04:  Thank you. We have a couple of more speaker cards for in-person speakers here, so I'd like to call up. Kevin Scott.
Speaker 33:  Hello, my name is Kevin Scott. I'm a resident of Northeast LA and I want to express my enthusiastic support for this project, which will add 184 units of housing, including 47 units affordable to very low-income renters.
Speaker 04:  Thank you.
Speaker 01:  Hi, my name is Anthony Daniel. I'm a self of 45 seconds because I don't want to be cut off. So I live in a low income subsidized housing program. And I was at one of the youth that all these people are talking about. Like, oh, they're just going to stay homeless. I have an issue with that because no. I sing with the trans course of Los Angeles as an openly trans activist. I have an issue with the fact that y'all really believe that this is all rumor based. I'm sorry if you live close to there, you should be in support of thi

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Speaker 04:  Yes, this is Caroline Cho. I would also agree that, you know, to Commissioner Newton's point that, you know, we're getting more units here and we're getting more affordable housing. And certainly there is no perfect answer or no perfect solution. But certainly I'm also for the mixed income housing, I believe. It allows for everybody to access various amenities that certain projects may not have. If it were separate and so, certainly, although I understand change is difficult, I think that we'll see.
Speaker 36:  Sissy Alamos for the record, just a friendly reminder, there is a technical modification to adopt with it emotion.
Speaker 04:  Thank you.
Speaker 02:  I'm always like, oh, I gotta do it right now.
Speaker 04:  Karen Mack.
Speaker 02:  I move approval. Do I need to say the case number?
Speaker 04:  Thank you.
Speaker 02:  Thank you.
Speaker 04:  for a second.
Speaker 38:  Mr. Newton, second.
Speaker 36:  to see that we have a first and a second. So see the lemma's 

In [21]:
data = {
    "speaker": speaker_ls,
    "text": speaker_text_ls
}

df = pd.DataFrame(data)
df.to_csv(audio_file.replace('.wav', '') + ".csv", index=False)
df

Unnamed: 0,speaker,text
0,4,"Thank you. With that, we will move on to ite..."
1,39,"Good morning commissioners. I'm Kevin Golden,..."
2,35,"Morning commissioners, Heather Bliemers, the ..."
3,39,Thank you. And now what I will do is proceed...
4,4,Thank you so much. We will now hear from the ...
...,...,...
251,36,Commissioner Lyshay? Yes. Commissioner Zamora?
252,38,you
253,3,Yes.
254,36,Commissioner Vice President Cho? Yes. And the...
