In [None]:
%pip install pyannote.audio

In [None]:
# Instantiate the pretrained speaker-diarization pipeline.
import os
import warnings

import torch
from dotenv import load_dotenv
from pyannote.audio import Pipeline

# Load variables from a .env file (if present) so the token can be read
# from the environment instead of being hard-coded in the notebook.
load_dotenv()

access_token = os.getenv("HUGGINGFACE_TOKEN")
if not access_token:
    # Warn instead of raising so that setups which do not need a token keep working.
    warnings.warn(
        "HUGGINGFACE_TOKEN is not set; loading the pipeline may fail.",
        stacklevel=2,
    )

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    token=access_token,
)

# NOTE(review): some pyannote.audio versions appear to return None (instead of
# raising) when the checkpoint cannot be downloaded — confirm for the installed
# version. Failing here gives a clearer error than `pipeline.to(...)` below.
if pipeline is None:
    raise RuntimeError(
        "Pipeline.from_pretrained returned None; check HUGGINGFACE_TOKEN and "
        "access to 'pyannote/speaker-diarization-3.1'."
    )

print(pipeline)

# Run the pipeline on CUDA when it is available.
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))
    print("cuda is available")
else:
    print("cuda is not available")


In [None]:
import os

import librosa
import torch

path = "./audio/싼기타_비싼기타.mp3"

# Decode the audio as mono at 16 kHz, then add a leading dimension of size 1.
y, sample_rate = librosa.load(path, sr=16000, mono=True)
waveform = torch.tensor(y).float().unsqueeze(0)

# Quick sanity checks: tensor shape, duration in seconds, and amplitude statistics.
print("shape:", waveform.shape, "sr:", sample_rate)
print("sec:", waveform.shape[-1] / sample_rate)
print("mean_abs:", waveform.abs().mean().item(), "max_abs:", waveform.abs().max().item())

# Run diarization on the in-memory waveform.
diarize_out = pipeline({"waveform": waveform, "sample_rate": sample_rate})

# This line is the key part: the annotation accessor that matches the installed version.
ann = diarize_out.speaker_diarization
# (Optional) to get rid of overlapping speech instead:
# ann = diarize_out.exclusive_speaker_diarization

# Write the RTTM next to the audio file ("./audio/싼기타_비싼기타.rttm"), which is
# the path the following cell reads. Writing it to the working directory, as
# before, made the read fail or silently pick up a stale file.
rttm_out_path = os.path.splitext(path)[0] + ".rttm"
with open(rttm_out_path, "w", encoding="utf-8") as rttm:
    ann.write_rttm(rttm)


In [None]:
import pandas as pd

rttm_path = "./audio/싼기타_비싼기타.rttm"

# Labels applied, in order, to the space-separated fields of each RTTM line.
rttm_columns = [
    "type", "file", "chnl", "start", "duration",
    "C1", "C2", "speaker_id", "C3", "C4",
]

# The file has no header row, so column names are supplied explicitly.
df_rttm = pd.read_csv(rttm_path, sep=" ", header=None, names=rttm_columns)

display(df_rttm)

In [None]:
# End time of every segment: its start plus its duration.
segment_end = df_rttm["start"].add(df_rttm["duration"])
df_rttm["end"] = segment_end

display(df_rttm)

In [None]:
# Create the "number" column filled with None, then set its value in row 0 to 0.
turn_col = "number"
df_rttm[turn_col] = None
df_rttm.at[0, turn_col] = 0

display(df_rttm)

In [None]:
# Number consecutive runs of the same speaker: the counter starts at 0 and goes
# up by one every time speaker_id differs from the previous row.
# Vectorized replacement for the row-by-row loop: the whole column is assigned in
# one step, the result is an integer column rather than object dtype, and it does
# not require the index to be 0..n-1.
speaker_changed = df_rttm["speaker_id"] != df_rttm["speaker_id"].shift()
# The first row always counts as a change (there is no previous row), hence "- 1".
df_rttm["number"] = speaker_changed.cumsum() - 1

display(df_rttm)

In [None]:
# Collapse each run of rows sharing the same "number" into a single row:
# earliest start, latest end, and the speaker of the run's first row.
turns = df_rttm.groupby("number")
df_rttm_grouped = turns.agg(
    start=("start", "min"),
    end=("end", "max"),
    speaker_id=("speaker_id", "first"),
)

display(df_rttm_grouped)

In [16]:
# Length of each merged row, recomputed from its start and end.
turn_length = df_rttm_grouped["end"].sub(df_rttm_grouped["start"])
df_rttm_grouped["duration"] = turn_length

# Drop the "number" group index in favour of a fresh 0..n-1 index.
df_rttm_grouped = df_rttm_grouped.reset_index(drop=True)

display(df_rttm_grouped)

Unnamed: 0,start,end,speaker_id,duration
0,0.824,30.169,SPEAKER_01,29.345
1,32.414,42.708,SPEAKER_00,10.294
2,41.695,44.007,SPEAKER_01,2.312
3,45.813,67.059,SPEAKER_00,21.246
4,67.227,82.752,SPEAKER_01,15.525
5,84.659,102.53,SPEAKER_00,17.871
6,103.475,117.498,SPEAKER_01,14.023
7,119.742,138.659,SPEAKER_00,18.917
8,139.334,168.916,SPEAKER_01,29.582
9,170.89,192.238,SPEAKER_00,21.348


In [17]:
# Save the merged table as a comma-separated file, without the index column.
csv_out_path = './audio/싼기타_비싼기타_rttm.csv'
df_rttm_grouped.to_csv(csv_out_path, sep=",", index=False)