In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from glob import glob
from sklearn.metrics.pairwise import cosine_similarity
from malaya.graph.pagerank import pagerank
import networkx as nx
import pickle
import numpy as np
from tqdm import tqdm

In [3]:
agg_function = np.mean
pkls = sorted(glob('/home/husein/ssd2/processed-youtube/*.pkl'))
len(pkls)

2541

In [4]:
speakers = {}

for pkl in tqdm(pkls):
    with open(pkl, 'rb') as fopen:
        data = pickle.load(fopen)
        
    filename = os.path.split(pkl)[1].replace('.pkl', '')
    
    for result in data:
        speaker = result['classification_model'][1]
        vector = result['classification_model'][0]
        
        speaker_name = f'{filename}-{speaker}'
        
        if speaker_name not in speakers:
            speakers[speaker_name] = vector
        else:
            speakers[speaker_name] = agg_function([vector, speakers[speaker_name]], axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2541/2541 [00:00<00:00, 2733.13it/s]


In [5]:
len(speakers)

9304

In [6]:
embeddings = list(speakers.values())
list_speakers = list(speakers.keys())
similar = (cosine_similarity(embeddings) + 1) / 2
similar[np.diag_indices(len(similar))] = 0.0

In [7]:
import scipy as sp
import scipy.sparse as sprs
import scipy.spatial
import scipy.sparse.linalg
from scipy import sparse

G = sparse.csr_matrix(similar)
A = G
n, _ = A.shape
r = sp.asarray(A.sum(axis=1)).reshape(-1)
k = r.nonzero()[0]
D_1 = sprs.csr_matrix((1 / r[k], (k, k)), shape=(n, n))

In [8]:
personalize = sp.ones(n)
personalize = personalize.reshape(n, 1)
s = (personalize / personalize.sum()) * n
I = sprs.eye(n)

In [9]:
p = 0.85
x = sprs.linalg.gmres((I - p * A.T @ D_1), s)

In [10]:
scores = (x[0] / x[0].sum())
ranked = sorted(
    [
        (scores[i], s, i)
        for i, s in enumerate(list_speakers)
    ],
    reverse=False,
)
sorted_speakers = [r[1] for r in ranked]

In [11]:
similarity_threshold = 0.7

In [12]:
G = nx.DiGraph()
G.add_nodes_from(list_speakers)

for speaker in tqdm(sorted_speakers):
    embeddings = list(speakers.values())
    list_speakers = list(speakers.keys())
    similar = (cosine_similarity(embeddings) + 1) / 2
    similar[np.diag_indices(len(similar))] = 0.0
    s = similar[list_speakers.index(speaker)]

    where = np.where(s >= similarity_threshold)[0]
    if len(where):
        argsort = (np.argsort(s)[::-1]).tolist()
        argsort = [a for a in argsort if a in where]
        speakers[list_speakers[argsort[0]]] = np.mean(
            [speakers[speaker], speakers[list_speakers[argsort[0]]]], axis=0)
        speakers.pop(speaker, None)

        G.add_edge(speaker, list_speakers[argsort[0]])

 65%|██████████████████████████████████████████████████████████████                                  | 6016/9304 [20:27<03:44, 14.62it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 9304/9304 [22:21<00:00,  6.94it/s]


In [13]:
mapping = {}
for speaker in tqdm(sorted_speakers):
    traversed = list(nx.dfs_edges(G, source=speaker))
    if len(traversed):
        new_label = traversed[-1][-1]
    else:
        new_label = speaker
    
    mapping[speaker] = new_label

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 9304/9304 [00:00<00:00, 68046.89it/s]


In [14]:
mapping

{'Teka_Lagu_Ini_-_OKLETSGO_EP9-HHObXWEsYso-speaker 18': 'Teka_Lagu_Ini_-_OKLETSGO_EP9-HHObXWEsYso-speaker 18',
 'Teka_Lagu_CNY_Versi_Amat_Larbsib___SEISMIK_Challenge-1Bns0c5G15I-speaker 2': 'Kasut_hujan___Episod_Penuh_Bing___Bing_Bahasa_Melayu-AFGRShwA_-k-speaker 0',
 'Episod_Baru_Upin_&_Ipin_Musim_15_-_Juara_Karaoke-d4jDsqvbeHI-speaker 0': 'Kasut_hujan___Episod_Penuh_Bing___Bing_Bahasa_Melayu-AFGRShwA_-k-speaker 0',
 "The_Fashion_Weak_Podcast_Ep_22_-_Why_Fashion_School_Didn't_Work_Out_For_Zee_Avi-woDYSE37_rI-speaker 2": "The_Fashion_Weak_Podcast_Ep_22_-_Why_Fashion_School_Didn't_Work_Out_For_Zee_Avi-woDYSE37_rI-speaker 2",
 'The_Fashion_Weak_Podcast_EP_41_-_Investment_Tips_For_Chee_Seng_With_Fuad_&_Shaq_From_Kyoto_Protocol-eWNQhDNoSsY-speaker 10': 'The_Fashion_Weak_Podcast_EP_41_-_Investment_Tips_For_Chee_Seng_With_Fuad_&_Shaq_From_Kyoto_Protocol-eWNQhDNoSsY-speaker 10',
 'Joke_Pakcik_Pakcik_-_OKLETSGO_EP5-HaNtQDSnlEU-speaker 8': 'Joke_Pakcik_Pakcik_-_OKLETSGO_EP5-HaNtQDSnlEU-speaker 

In [15]:
len(set(mapping.values()))

523

In [16]:
len(set(mapping.keys()))

9304

In [18]:
import json

with open('mapping-youtube-speakers-70.json', 'w') as fopen:
    json.dump(mapping, fopen)