In [1]:
import os
from tqdm import tqdm
import pandas as pd
import librosa
import whisper
from pyannote.audio import Inference
from sklearn.cluster import DBSCAN
from pyannote.audio import Model

In [2]:
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
openunmix = torch.hub.load('sigsep/open-unmix-pytorch', 'umxhq', device=device)
transcription_model = whisper.load_model("medium")
embeddings_model = Model.from_pretrained("pyannote/embedding", use_auth_token="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")
inference = Inference(embeddings_model, window="whole", device=device)

Using cache found in C:\Users\wikto/.cache\torch\hub\sigsep_open-unmix-pytorch_master
  checkpoint = torch.load(fp, map_location=device)
c:\Studia\.conda\Lib\site-packages\pytorch_lightning\utilities\migration\migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\wikto\.cache\torch\pyannote\models--pyannote--embedding\snapshots\4db4899737a38b2d618bbd74350915aa10293cb2\pytorch_model.bin`
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.4.0. To apply the upgrade to your files

Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.


c:\Studia\.conda\Lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['loss_func.W']


In [5]:
def preprocess(file_folder: str):
    files = [f'{file_folder}/{f}' for f in os.listdir(file_folder) if f.endswith('.wav')]
    
    voices, backgrounds = [], []
    data = []
    for file in tqdm(files, desc='Separacja ścieżek audio i transkrypcja'):
        wav, sr = librosa.load(file, sr=openunmix.sample_rate, mono=True)
        wav = torch.Tensor(wav).unsqueeze(0).unsqueeze(0).repeat(1, 2, 1).to(device)
        separated = openunmix(wav).squeeze(0).cpu().detach()
        voice, background = separated[0, :, :], separated[1:, :, :].sum(0)
        voices.append(voice)
        backgrounds.append(background)
        voice = voice[0].numpy()
        voice_resampled = librosa.resample(voice, orig_sr=sr.item(), target_sr=16000)

        result = transcription_model.transcribe(voice_resampled, language="en", fp16=True, word_timestamps=True)

        for segment in result['segments']:
            start = segment['start']
            end = segment['end']
            fragment = wav[:, :, int(start * sr):int(end * sr)]
            data.append([file, start, end, segment['text'], fragment])

    data = pd.DataFrame(data, columns=['file', 'start', 'end', 'text', 'wav'])
    embeddings = []
    for wav in tqdm(data['wav'].to_list(), desc='liczenie embeddingów mówcy'):
        embedding = inference({"waveform": torch.tensor(wav)[0], "sample_rate": sr})
        embeddings.append(embedding)

    dbscan = DBSCAN(eps=0.75, min_samples=1, metric='cosine')
    labels = dbscan.fit_predict(embeddings)

    data['speaker'] = labels

    return data      
        

In [6]:
data = preprocess("test_data")

Separacja ścieżek audio i transkrypcja: 100%|██████████| 5/5 [00:25<00:00,  5.13s/it]
  embedding = inference({"waveform": torch.tensor(wav)[0], "sample_rate": sr})
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

liczenie embeddingów mówcy: 100%|██████████| 31/31 [00:00<00:00, 87.27it/s]


In [7]:
import requests

In [8]:
API_KEY = "ceMTC5HC1gvae9VTEZ_oWcWXQaTEcJos4j-Y8VHpfbu4BAhP"
MODELS_ENDPOINT = "https://services.clarin-pl.eu/api/v1/oapi/models"
COMPLETIONS_ENDPOINT = "https://services.clarin-pl.eu/api/v1/oapi/chat/completions"
CLARIN_BASE_URL = "https://services.clarin-pl.eu/api/v1/oapi"
MODEL_ID = 'llama3.1'

In [9]:
def prompt_chat(model_id, sentence):
    url = COMPLETIONS_ENDPOINT
    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
    messages = []
    messages.append({"role": "user", "content": f"Translate to polish and don't add anything else: '{sentence}'"})
    data = {
        "model": model_id,
        "messages": messages
        #"max_tokens": max_tokens
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        raise Exception(f"CLARIN API Error: {response.status_code} - {response.text}")

In [10]:
data['translated_text'] = data['text'].apply(lambda x: prompt_chat(MODEL_ID, x))
data

Unnamed: 0,file,start,end,text,wav,speaker,translated_text
0,test_data/witcher_en_2.wav,0.0,2.58,"Anything to say for yourself, young lady?","[[[tensor(0.0622, device='cuda:0'), tensor(0.0...",0,Czy masz coś do powiedzenia na swoją obronę mł...
1,test_data/witcher_en_2.wav,5.06,7.1,"I'm very sorry, Uncle Vesemir.","[[[tensor(-0.0026, device='cuda:0'), tensor(-0...",1,"' Bardzo przepraszam, wujku Wesemirze.'"
2,test_data/witcher_en_2.wav,7.44,9.88,"Young blood craves action, I understand that.","[[[tensor(-0.0013, device='cuda:0'), tensor(-0...",0,"'Młoda krew pragnie działania, rozumiem to.'"
3,test_data/witcher_en_2.wav,10.32,14.48,"But when you fight a beast, knowledge counts ...","[[[tensor(0.0233, device='cuda:0'), tensor(0.0...",0,"Ale gdy walcysz z bestią, wiedza liczy się tyl..."
4,test_data/witcher_en_2.wav,15.0,19.1,"At the very least, you ought to be able to te...","[[[tensor(0.0022, device='cuda:0'), tensor(0.0...",0,'Przynajmniej powinieneś potrafić odróżnić gro...
5,test_data/witcher_en_2.wav,19.48,23.92,"By markings, like unto the Panthera Tigris th...","[[[tensor(-0.0297, device='cuda:0'), tensor(-0...",1,"Przez znaki, podobne jak u tygrysa z rodzaju P..."
6,test_data/witcher_en_2.wav,23.92,26.32,And by the sickly paleness of its visage.,"[[[tensor(0.0262, device='cuda:0'), tensor(0.0...",1,'O chorobliwej bladości jego twarzy'.
7,test_data/witcher_en_2.wav,27.76,31.02,Hmm. So you did read the chapter.,"[[[tensor(0.0103, device='cuda:0'), tensor(0.0...",0,"Ciekawe, wygląda na to, że przeczytałeś ten ro..."
8,test_data/witcher_en_2.wav,31.8,33.92,"Still, you should have asked if...","[[[tensor(0.0417, device='cuda:0'), tensor(0.0...",0,W każdym razie powinieneś spytać się czy...
9,test_data/witcher_en_2.wav,33.92,35.98,"But you were asleep, Uncle Vesemir.","[[[tensor(0.0235, device='cuda:0'), tensor(0.0...",1,"Ale śpisz, wuju Vesimirze."
