In [1]:
import os
from heapq import heappush, heappushpop, heappop

import numpy as np
import soundfile
import torch

from datasets import load_dataset, interleave_datasets
from huggingface_hub import login
from pyannote.audio import Model, Inference
from tqdm.notebook import tqdm

# Hugging Faceへのログインとモデル・データセットのロード

必要なもの:

- Hugging Faceのアカウントとアクセストークン (token)
- https://huggingface.co/pyannote/embedding のユーザー規約への同意
## ログイン

In [3]:
# Prefer the HF_TOKEN environment variable so the secret does not have to be
# pasted into (and saved with) the notebook. The placeholder remains as a
# fallback for readers who would rather edit the value in place.
your_hugging_face_token = os.environ.get("HF_TOKEN") or "your-huggingface-token"
login(token=your_hugging_face_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to C:\Users\nadare\.cache\huggingface\token
Login successful


## データセットのロード
大規模なデータで確認する際は`small`を`all`に変更してください

In [4]:
# Request the "small" configuration's train split in streaming mode; the
# search loop later iterates over `ds` one example at a time.
ds = load_dataset(
    "reazon-research/reazonspeech",
    "small",
    split="train",
    streaming=True,
)

## PyAnnoteのモデルのロード

In [5]:
# Run on the GPU when one is available and fall back to the CPU otherwise, so
# the notebook still works (more slowly) on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# The model is gated: the token from the login cell is passed so the download
# is authorised.
embedding_model = Model.from_pretrained("pyannote/embedding",
                                        use_auth_token=your_hugging_face_token)
# window="whole" is used so that each input yields a single embedding vector,
# which is how the later cells consume the result.
embedding_inference = Inference(embedding_model, window="whole", device=embedding_device)

# 検索

## クエリの用意

### 音声ファイルから読み込む場合

In [None]:
# Embed one reference recording, then rescale the vector to unit L2 length.
# The 1e-9 floor keeps the division finite if the norm happens to be zero.
emb = embedding_inference("your_files_path.wav")
emb_norm = max(np.linalg.norm(emb, ord=2), 1e-9)
query_embedding = emb / emb_norm

### 複数の音声ファイルの埋め込みの平均をとる場合

In [None]:
search_dir = "your_target_dir"
target_extension = ".wav"

# Recursively collect every file under `search_dir` that has the target
# extension. The list must be created here: appending to an undefined `paths`
# raised a NameError on a fresh kernel.
paths = []
for dirpath, dirnames, filenames in os.walk(search_dir):
    for file in filenames:
        if file.endswith(target_extension):
            paths.append(os.path.join(dirpath, file))

# Fail with a clear message instead of letting np.stack choke on an empty list.
if not paths:
    raise FileNotFoundError(
        f"No '{target_extension}' files found under '{search_dir}'"
    )

# One embedding per file, stacked along axis 0.
file_embeddings = np.stack([embedding_inference(path) for path in tqdm(paths)], axis=0)

# Scale every file's embedding to unit L2 length before averaging so that each
# file contributes equally, then rescale the mean to unit length again.
# Distinct names are used for each stage so this cell no longer overwrites the
# `embeddings` array that the catalogue cell loads.
file_norms = np.maximum(1e-9, np.linalg.norm(file_embeddings, ord=2, axis=-1, keepdims=True))
mean_embedding = (file_embeddings / file_norms).mean(axis=0)
mean_norm = np.maximum(1e-9, np.linalg.norm(mean_embedding, ord=2, axis=-1, keepdims=True))
query_embedding = mean_embedding / mean_norm

### voice-changer-vector-searchのデータから探す場合

In [6]:
!git clone https://github.com/nadare881/voice-changer-vector-search.git
os.chdir("./voice-changer-vector-search")
!git checkout main
!git pull origin
!git reset --hard origin/main
os.chdir("../")

fatal: destination path 'voice-changer-vector-search' already exists and is not an empty directory.


Your branch is up to date with 'origin/main'.


Already on 'main'


Already up to date.


From https://github.com/nadare881/voice-changer-vector-search
   5bfdf10..7f32821  develop    -> origin/develop


HEAD is now at 5326794 Merge pull request #19 from nadare881/develop


In [7]:
import pandas as pd

# Read the two data files shipped in the cloned repository: the embeddings
# array and the metadata table describing the catalogued voices.
embeddings = np.load("voice-changer-vector-search/data/embeddings.npy")
meta_df = pd.read_csv("voice-changer-vector-search/data/meta.csv")

# Preview the first few metadata rows.
meta_df.head()

Unnamed: 0,voice_id,display_name,distribution_type,url,voice_sample,thumbnail,speaker,actor,corpus,language,...,license_type,license_url,terms_link,commercial_use,price,twitter,ex_url_1,model_type,base_model,input_voice
0,0,あみたろのITAコーパス読み上げ音声,voice,https://amitaro.net/voice/corpus-list/ita/,https://www.youtube.com/embed/ksKu6JyLP5I,https://amitaro.net/wp-content/uploads/cfc421f...,あみたろ,あみたろ,ITA,ja,...,original,https://amitaro.net/voice/corpus-list/ita/,https://amitaro.net/voice/voice_rule/,True,0,https://twitter.com/amitaro_utau,https://amitaro.net/,,,
1,1,刻鳴時雨 RVC用学習済みモデル,model,https://huggingface.co/yasyune/Shigure_Tokina_RVC,https://s2.booth.pm/4bbcead3-a3a3-40b7-8364-82...,https://s2.booth.pm/4bbcead3-a3a3-40b7-8364-82...,刻鳴時雨,丸ころ,ITA,ja,...,original,https://bindume-chan.booth.pm/items/3640133,https://bindume-chan.booth.pm/items/3640133,True,0,,,RVC,hubert_basr,https://bindume-chan.booth.pm/items/3640133
2,2,刻鳴時雨ITAコーパス読み上げ音声素材,voice,https://booth.pm/ja/items/3640133,https://s2.booth.pm/4bbcead3-a3a3-40b7-8364-82...,https://s2.booth.pm/4bbcead3-a3a3-40b7-8364-82...,刻鳴時雨,丸ころ,ITA,ja,...,original,https://bindume-chan.booth.pm/items/3640133,https://bindume-chan.booth.pm/items/3640133,True,0,,,,,
3,3,黄琴まひろ　ITAコーパス読み上げ音声【1.00】,voice,https://kikyohiroto1227.wixsite.com/kikoto-uta...,https://www.youtube.com/embed/LT6D4Mx2xgo,https://static.wixstatic.com/media/8a7d85_767d...,黄琴まひろ,黄鏡博人,ITA,ja,...,original,https://kikyohiroto1227.wixsite.com/kikoto-uta...,https://kikyohiroto1227.wixsite.com/kikoto-uta...,True,0,https://twitter.com/KikyoHiloto,,,,
4,4,黄琴海月　ITAコーパス読み上げ音声【4.00】,voice,https://kikyohiroto1227.wixsite.com/kikoto-uta...,https://www.youtube.com/embed/SXnB8b1fKP4,https://static.wixstatic.com/media/8a7d85_5bec...,黄琴海月,黄鏡博人,ITA,ja,...,original,https://kikyohiroto1227.wixsite.com/kikoto-uta...,https://kikyohiroto1227.wixsite.com/kikoto-uta...,True,0,https://twitter.com/KikyoHiloto,,,,


In [8]:
# Use Amitaro's ITA-corpus reading voice (index 0 of the loaded embeddings) as the query.
# Scale it to unit L2 length, as the other query cells do, so that the dot
# product computed in the search loop is a cosine similarity.
amitaro_embedding = embeddings[0]
query_embedding = amitaro_embedding / max(np.linalg.norm(amitaro_embedding, ord=2), 1e-9)

## 検索パート

### 検索

In [None]:
# heappush / heappushpop come from the import cell at the top of the notebook.
save_size = 10000  # maximum number of best-scoring clips to keep
result_heap = []   # min-heap: result_heap[0] is the weakest clip currently kept
count = 0          # running index of the clip; also breaks ties between equal scores

# Put the waveform on the GPU when there is one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# tqdm wraps the stream directly, so the bar advances once per clip without
# manual update() calls.
for data in tqdm(ds):
    audio = data["audio"]
    # Reshape the samples to a (1, n) float32 tensor and pass it with its sample rate.
    input_ = {
        "waveform": torch.from_numpy(audio["array"].reshape([1, -1])).to(device=device, dtype=torch.float32),
        "sample_rate": audio["sampling_rate"]
    }
    emb = embedding_inference(input_)
    # Unit-normalise the clip embedding (the 1e-9 floor avoids dividing by zero)
    # and score it against the query with a dot product.
    target_embedding = emb / max(np.linalg.norm(emb, ord=2), 1e-9)
    score = np.dot(query_embedding, target_embedding)

    # Entries are one-element lists wrapping (score, count, audio): the
    # extraction cell unpacks them with `[0]`, and the unique `count` means the
    # comparison never reaches the audio dict.
    entry = [(score, count, audio)]
    if len(result_heap) < save_size:
        heappush(result_heap, entry)
    else:
        # Heap is full: insert the new clip and drop whichever is now the worst.
        heappushpop(result_heap, entry)
    count += 1

0it [00:00, ?it/s]

Reading metadata ...: 0it [00:00, ?it/s]

In [9]:
# Turn the heap into two parallel lists ordered from lowest to highest score.
# Sorting a copy (instead of popping) leaves `result_heap` intact, so this cell
# can be re-run without repeating the search. The key stops at (score, count),
# so the audio dicts are never compared.
ranked_entries = sorted(result_heap, key=lambda entry: entry[0][:2])

scores = []
audios = []
for entry in ranked_entries:
    s, _, a = entry[0]
    scores.append(s)
    # The search cell stores the dataset's audio dict unchanged, so "array" and
    # "sampling_rate" are read directly. The previous tensor-style access
    # (`.detach()`, `[0]`) does not apply to that dict.
    audios.append({"array": np.asarray(a["array"]),
                   "sampling_rate": int(a["sampling_rate"])})

### 結果の確認
`audios` はスコアが低い順に並んでいます(末尾ほどクエリに近い音声です)。実際に聴いて確認し、使えそうな範囲だけを切り出してください。

In [10]:
from IPython.display import Audio
# `audios` is filled in ascending score order, so -1 selects the clip with the
# highest score; change the index to audition other positions.
index = -1
# Uncomment the next line to play the selected clip inside the notebook.
# Audio(audios[index]["array"], rate=audios[index]["sampling_rate"], autoplay=True)

### 保存

In [12]:
target_dir = "./output/reazonspeech_like_あみたろ_raw/"
os.makedirs(target_dir, exist_ok=True)

# Write one WAV file per retained clip, named after its position in `audios`.
for i, audio in enumerate(audios):
    wav_path = os.path.join(target_dir, f"{i}.wav")
    soundfile.write(wav_path, audio["array"], audio["sampling_rate"])