In [1]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [2]:
from scipy.spatial import KDTree
from datasketch import MinHash, MinHashLSH
from glob import glob
from tqdm import tqdm
import numpy as np
import mp
import pandas as pd

In [3]:
# glob() returns paths in a filesystem-dependent order, so sort them to make the
# row order of `embeddings` (and every index derived from it) reproducible.
# NOTE(review): those indices are later applied positionally to the rows of
# filtered-podcast.parquet -- confirm that this file order matches the parquet's row order.
files = sorted(glob('embedding-podcast/*.npy'))
len(files)

75965

In [4]:
# Read every .npy file (with a progress bar) and stack the loaded arrays,
# in `files` order, into a single numpy array.
embeddings = np.array([np.load(path) for path in tqdm(files)])

100%|████████████████████████████████████████████████████████████████████████████████| 75965/75965 [00:01<00:00, 45752.78it/s]


In [5]:
# Sanity check: the first dimension should equal len(files).
embeddings.shape

(75965, 192)

In [6]:
from sklearn.preprocessing import normalize

def deduplicate_embeddings(embeddings, similarity_threshold=0.9, batch_size=300000):
    """
    Greedily group embeddings whose cosine similarity reaches a threshold.

    Rows are visited in index order. Each row that has not already been claimed
    by an earlier group becomes an anchor and claims every later, still-unclaimed
    row whose cosine similarity to the anchor is >= similarity_threshold.
    Similarity is only measured against the anchor, so two members of one group
    are not guaranteed to be similar to each other, and the grouping depends on
    the order of the rows.

    Args:
        embeddings: array-like of shape [N, dim] where N is number of embeddings
        similarity_threshold: float between 0 and 1; cosine similarity at or above
            which a row is treated as a duplicate of the anchor
        batch_size: number of rows compared against the anchor per matrix product.
            It bounds the size of the temporary similarity vector and does not
            change the result. Defaults to 300000.

    Returns:
        unique_indices: integer numpy array with the indices of embeddings that
            matched no other embedding. The anchor (first index) of a duplicate
            group is NOT included here; it only appears in duplicate_groups.
        duplicate_groups: list of lists of indices; each inner list starts with the
            anchor index followed by the indices it claimed.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        # A zero batch size would never advance through the rows.
        raise ValueError("batch_size must be a positive integer")

    if len(embeddings) == 0:
        # Nothing to compare; keep the integer dtype so the result is usable as indices.
        return np.array([], dtype=int), []

    # Normalize rows so that a plain dot product is the cosine similarity
    embeddings = normalize(embeddings)
    N = embeddings.shape[0]

    # Track which embeddings have already been claimed by an earlier anchor
    is_duplicate = np.zeros(N, dtype=bool)
    duplicate_groups = []
    unique_indices = []

    for i in tqdm(range(N)):
        if is_duplicate[i]:
            continue

        # Keep the anchor 2-D so the product below yields a (1, batch) matrix
        anchor = embeddings[i:i+1]
        similar_indices = [i]

        # Only rows after the anchor are compared, one batch at a time
        for start_idx in range(i + 1, N, batch_size):
            end_idx = min(start_idx + batch_size, N)
            batch_similarities = anchor @ embeddings[start_idx:end_idx].T

            # Positions inside the batch, shifted back to global row indices
            batch_similar = np.where(batch_similarities[0] >= similarity_threshold)[0]
            batch_similar_global_idx = batch_similar + start_idx

            # Filter out rows already claimed by an earlier anchor
            batch_similar_global_idx = batch_similar_global_idx[~is_duplicate[batch_similar_global_idx]]

            if len(batch_similar_global_idx) > 0:
                similar_indices.extend(batch_similar_global_idx.tolist())
                is_duplicate[batch_similar_global_idx] = True

        if len(similar_indices) > 1:
            duplicate_groups.append(similar_indices)
        else:
            unique_indices.append(i)

    # dtype=int keeps an empty result integer-typed instead of the float default
    return np.array(unique_indices, dtype=int), duplicate_groups

In [7]:
# Group the embeddings with a 0.95 cosine threshold (stricter than the function's 0.9 default).
unique_indices, duplicate_groups = deduplicate_embeddings(embeddings, similarity_threshold=0.95)

100%|█████████████████████████████████████████████████████████████████████████████████| 75965/75965 [00:36<00:00, 2056.87it/s]


In [13]:
import json

# Save both outputs of the grouping step to a JSON file.
with open('deduped-podcasts-95p.json', 'w') as fopen:
    dedup_result = {}
    # The numpy array is converted to a plain list so it can be JSON-encoded.
    dedup_result['unique_indices'] = unique_indices.tolist()
    dedup_result['duplicate_groups'] = duplicate_groups
    json.dump(dedup_result, fopen)

In [16]:
from huggingface_hub import HfApi
api = HfApi()
# Upload the local filtered-podcast.parquet to the dataset repo
# malaysia-ai/dedup-malaysian-speakers, stored at malaysian-podcasts/filtered-podcast.parquet.
# NOTE(review): no token is passed, so credentials presumably come from a cached
# login or the environment -- confirm before re-running.
api.upload_file(
    path_or_fileobj="filtered-podcast.parquet",
    path_in_repo="malaysian-podcasts/filtered-podcast.parquet",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

filtered-podcast.parquet:   0%|          | 0.00/7.70M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers/commit/d2f9918445abf9d42a6277b1a593a41b9bffd467', commit_message='Upload malaysian-podcasts/filtered-podcast.parquet with huggingface_hub', commit_description='', oid='d2f9918445abf9d42a6277b1a593a41b9bffd467', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-malaysian-speakers'), pr_revision=None, pr_num=None)

In [18]:
# Load the table whose rows are picked by position (df.iloc) using `unique_indices`.
df = pd.read_parquet('filtered-podcast.parquet')
df.shape

(75965, 2)

In [21]:
# Collect, as plain dicts, the rows of `df` at the positions listed in `unique_indices`.
# NOTE(review): `unique_indices` leaves out every member of a duplicate group,
# including its first element, so no copy of a duplicated clip is kept -- confirm
# that dropping the whole group is intended.
selected = [df.iloc[i].to_dict() for i in tqdm(unique_indices)]

len(selected)

100%|████████████████████████████████████████████████████████████████████████████████| 73384/73384 [00:02<00:00, 26655.61it/s]


73384

In [22]:
# Inspect one selected record.
selected[0]

{'audio': '/home/husein/ssd4/malaysian-podcast_processed/Super. Sunday： Brand Jahat Local [hQkCidjHoVM]/Super. Sunday： Brand Jahat Local [hQkCidjHoVM]_1.mp3',
 'transcription': 'Ah, benda tu kalau jadi, memang, satu eksperimen yang besar, melibatkan kos yang besar, tapi, impact kita tak tahu, kita just nak buat satu benda gila lah, Voltron dengan Super Sunday. Insya Allah, tahun ni kalau sempat lah.'}

In [33]:
import os  # imported here because the cell that imports os comes later in the notebook

# The clips are copied into a single flat directory keyed by file name, so the two
# numbers below must match; otherwise clips sharing a name would overwrite each other.
basenames = {os.path.basename(s['audio']) for s in selected}
len(basenames), len(selected)

(73384, 73384)

In [24]:
# -p makes this cell safe to re-run: no error when the directory already exists.
!mkdir -p dedup-podcasts

mkdir: cannot create directory ‘dedup-podcasts’: File exists


In [34]:
import shutil
import os
# shutil.copyfile(src, dst)

def loop(rows):
    """
    Copy the audio file of every record into the local 'dedup-podcasts' directory.

    Args:
        rows: 2-item sequence. The first item is an iterable of dicts that each
            hold a source path under 'audio'; the second item is ignored
            (presumably a worker/chunk index supplied by mp.multiprocessing --
            verify against mp.py).

    Each file keeps its original file name; only the directory changes.
    """
    batch, _ = rows
    for record in tqdm(batch):
        source = record['audio']
        target = os.path.join('dedup-podcasts', os.path.basename(source))
        shutil.copyfile(source, target)

In [36]:
# Run `loop` over `selected` through the `mp` helper with cores = 10 and returned = False
# (presumably 10 worker processes with no results collected -- verify against mp.py).
mp.multiprocessing(selected, loop, cores = 10, returned = False)

100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:02<00:00, 2568.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2799.47it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:04<00:00, 1801.94it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:04<00:00, 1636.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:05<00:00, 1390.30it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:05<00:00, 1467.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:05<00:00, 1433.32it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7338/7338 [00:04<00:0

In [37]:
# Report the total on-disk size of the copied clips.
!du -hs dedup-podcasts

2.5G	dedup-podcasts


In [38]:
# Point every record at its copy inside dedup-podcasts/ (same file name, new directory).
# This mutates the dicts in `selected` in place.
for record in selected:
    record['audio'] = os.path.join('dedup-podcasts', os.path.basename(record['audio']))

In [40]:
# Inspect one record to confirm 'audio' now holds the dedup-podcasts/ path.
selected[1]

{'audio': 'dedup-podcasts/Macamana Pramugari Handle Penumpang Gatal -  Sabreena Ibrahim  (Bhg 1) [bCjrGP8tYOc]_6.mp3',
 'transcription': 'Okay, I sebenarnya macam ni. Masa I lapan belas tahun, I pergi interview, masa tu dekat Melaka. My mum hantarlah ni. My mum dengan my dad kita pergi Melaka.'}

In [41]:
import IPython.display as ipd
# Load one clip from its rewritten path as a spot check that the copied file is readable.
ipd.Audio(selected[1]['audio'])

In [43]:
# Write the selected records (with rewritten 'audio' paths) to a local parquet file.
pd.DataFrame(selected).to_parquet('dedup-malaysian-podcasts.parquet')

In [44]:
# Upload the local dedup-malaysian-podcasts.parquet to the dataset repo
# malaysia-ai/dedup-malaysian-speakers, stored at data/dedup_malaysian_podcasts-00000-of-00001.parquet.
api.upload_file(
    path_or_fileobj="dedup-malaysian-podcasts.parquet",
    path_in_repo="data/dedup_malaysian_podcasts-00000-of-00001.parquet",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

dedup-malaysian-podcasts.parquet:   0%|          | 0.00/7.01M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers/commit/2201c2377f499aa02e245db52849843fd157c1fc', commit_message='Upload data/dedup_malaysian_podcasts-00000-of-00001.parquet with huggingface_hub', commit_description='', oid='2201c2377f499aa02e245db52849843fd157c1fc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-malaysian-speakers'), pr_revision=None, pr_num=None)

In [45]:
# Archive the dedup-podcasts directory recursively (-r) and quietly (-q) into dedup-podcasts.zip.
!zip -rq dedup-podcasts.zip dedup-podcasts

In [46]:
# Upload the local dedup-podcasts.zip archive to the root of the dataset repo
# malaysia-ai/dedup-malaysian-speakers.
api.upload_file(
    path_or_fileobj="dedup-podcasts.zip",
    path_in_repo="dedup-podcasts.zip",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

dedup-podcasts.zip:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers/commit/ed4fa7fdd4c43de2a01b5cfa321f492821246e82', commit_message='Upload dedup-podcasts.zip with huggingface_hub', commit_description='', oid='ed4fa7fdd4c43de2a01b5cfa321f492821246e82', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-malaysian-speakers'), pr_revision=None, pr_num=None)