In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from glob import glob
from sklearn.metrics.pairwise import cosine_similarity
from malaya.graph.pagerank import pagerank
import networkx as nx
import pickle
import numpy as np
from tqdm import tqdm

In [24]:
audios = sorted(glob('/home/husein/ssd2/processed-youtube-v2/*/*.mp3'))
len(audios)

75946

In [25]:
agg_function = np.mean
pkls = sorted(glob('/home/husein/ssd2/processed-youtube-v2/*.pkl'))
len(pkls)

3680

In [14]:
speakers = {}

for pkl in tqdm(pkls):
    with open(pkl, 'rb') as fopen:
        data = pickle.load(fopen)
        
    filename = os.path.split(pkl)[1].replace('.pkl', '')
    
    for result in data:
        if len(result['asr_model'][0]) < 2:
            continue
        speaker = result['classification_model'][1]
        vector = result['classification_model'][0]
        
        speaker_name = f'{filename}-{speaker}'
        
        if speaker_name not in speakers:
            speakers[speaker_name] = vector
        else:
            speakers[speaker_name] = agg_function([vector, speakers[speaker_name]], axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [00:00<00:00, 2926.56it/s]


In [26]:
from datasets import Audio

reader = Audio(sampling_rate = 22050)

In [31]:
f = audios[2]
audio = reader.decode_example(reader.encode_example(f))['array']

In [32]:
import IPython.display as ipd
ipd.Audio(audio, rate = 22050)

In [7]:
embeddings = list(speakers.values())
list_speakers = list(speakers.keys())
similar = (cosine_similarity(embeddings) + 1) / 2
similar[np.diag_indices(len(similar))] = 0.0

In [8]:
import scipy as sp
import scipy.sparse as sprs
import scipy.spatial
import scipy.sparse.linalg
from scipy import sparse

G = sparse.csr_matrix(similar)
A = G
n, _ = A.shape
r = sp.asarray(A.sum(axis=1)).reshape(-1)
k = r.nonzero()[0]
D_1 = sprs.csr_matrix((1 / r[k], (k, k)), shape=(n, n))

In [9]:
personalize = sp.ones(n)
personalize = personalize.reshape(n, 1)
s = (personalize / personalize.sum()) * n
I = sprs.eye(n)

In [12]:
p = 0.85
x = sprs.linalg.gmres((I - p * A.T @ D_1), s)

In [19]:
scores = (x[0] / x[0].sum())
ranked = sorted(
    [
        (scores[i], s, i)
        for i, s in enumerate(list_speakers)
    ],
    reverse=False,
)
sorted_speakers = [r[1] for r in ranked]

In [22]:
similarity_threshold = 0.8

In [23]:
G = nx.DiGraph()
G.add_nodes_from(list_speakers)

for speaker in tqdm(sorted_speakers):
    embeddings = list(speakers.values())
    list_speakers = list(speakers.keys())
    similar = (cosine_similarity(embeddings) + 1) / 2
    similar[np.diag_indices(len(similar))] = 0.0
    s = similar[list_speakers.index(speaker)]

    where = np.where(s >= similarity_threshold)[0]
    if len(where):
        argsort = (np.argsort(s)[::-1]).tolist()
        argsort = [a for a in argsort if a in where]
        speakers[list_speakers[argsort[0]]] = np.mean(
            [speakers[speaker], speakers[list_speakers[argsort[0]]]], axis=0)
        speakers.pop(speaker, None)

        G.add_edge(speaker, list_speakers[argsort[0]])

 83%|███████████████████████████████████████████████████████████████████████████████▋                | 7723/9304 [34:07<03:44,  7.04it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [24]:
mapping = {}
for speaker in tqdm(sorted_speakers):
    traversed = list(nx.dfs_edges(G, source=speaker))
    if len(traversed):
        new_label = traversed[-1][-1]
    else:
        new_label = speaker
    
    mapping[speaker] = new_label

100%|████████████████████████████████████████████████████████████████████████████████████████████| 9304/9304 [00:00<00:00, 143787.05it/s]


In [25]:
mapping

{'Teka_Lagu_Ini_-_OKLETSGO_EP9-HHObXWEsYso-speaker 18': 'Teka_Lagu_Ini_-_OKLETSGO_EP9-HHObXWEsYso-speaker 18',
 'Teka_Lagu_CNY_Versi_Amat_Larbsib___SEISMIK_Challenge-1Bns0c5G15I-speaker 2': 'Teka_Lagu_CNY_Versi_Amat_Larbsib___SEISMIK_Challenge-1Bns0c5G15I-speaker 2',
 'Episod_Baru_Upin_&_Ipin_Musim_15_-_Juara_Karaoke-d4jDsqvbeHI-speaker 0': 'Episod_Baru_Upin_&_Ipin_Musim_15_-_Juara_Karaoke-d4jDsqvbeHI-speaker 0',
 "The_Fashion_Weak_Podcast_Ep_22_-_Why_Fashion_School_Didn't_Work_Out_For_Zee_Avi-woDYSE37_rI-speaker 2": "The_Fashion_Weak_Podcast_Ep_22_-_Why_Fashion_School_Didn't_Work_Out_For_Zee_Avi-woDYSE37_rI-speaker 2",
 'The_Fashion_Weak_Podcast_EP_41_-_Investment_Tips_For_Chee_Seng_With_Fuad_&_Shaq_From_Kyoto_Protocol-eWNQhDNoSsY-speaker 10': 'The_Fashion_Weak_Podcast_EP_41_-_Investment_Tips_For_Chee_Seng_With_Fuad_&_Shaq_From_Kyoto_Protocol-eWNQhDNoSsY-speaker 10',
 'Joke_Pakcik_Pakcik_-_OKLETSGO_EP5-HaNtQDSnlEU-speaker 8': 'Joke_Pakcik_Pakcik_-_OKLETSGO_EP5-HaNtQDSnlEU-speaker 8',


In [26]:
len(set(mapping.values()))

4193

In [27]:
len(set(mapping.keys()))

9304

In [28]:
import json

with open('mapping-youtube-speakers-80.json', 'w') as fopen:
    json.dump(mapping, fopen)