In [1]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [2]:
from scipy.spatial import KDTree
from datasketch import MinHash, MinHashLSH
from glob import glob
from tqdm import tqdm
import numpy as np
import mp
import pandas as pd
import os

In [3]:
files = glob('embedding-others/*.npy')
len(files)

555379

In [4]:
embeddings = []
for f in tqdm(files):
    embeddings.append(np.load(f))
embeddings = np.array(embeddings)

100%|███████████████████████████████████████████████████████████████████████████████| 555379/555379 [01:50<00:00, 5031.70it/s]


In [5]:
from sklearn.preprocessing import normalize

def deduplicate_embeddings(embeddings, similarity_threshold=0.9):
    """
    Deduplicate embeddings based on cosine similarity threshold.
    
    Args:
        embeddings: numpy array of shape [N, dim] where N is number of embeddings
        similarity_threshold: float between 0 and 1, threshold for considering embeddings as duplicates
    
    Returns:
        unique_indices: indices of unique embeddings
        duplicate_groups: list of lists containing indices of similar embeddings
    """
    # Normalize embeddings for cosine similarity
    embeddings = normalize(embeddings)
    N = embeddings.shape[0]
    
    # Track which embeddings have been marked as duplicates
    is_duplicate = np.zeros(N, dtype=bool)
    duplicate_groups = []
    unique_indices = []
    
    # Process embeddings in batches for memory efficiency
    batch_size = 300000
    
    for i in tqdm(range(N)):
        if is_duplicate[i]:
            continue
            
        # Calculate similarities for current embedding with remaining embeddings
        start_idx = i + 1
        similar_indices = [i]
        
        while start_idx < N:
            end_idx = min(start_idx + batch_size, N)
            batch_similarities = embeddings[i:i+1] @ embeddings[start_idx:end_idx].T
            
            # Find similar embeddings in batch
            batch_similar = np.where(batch_similarities[0] >= similarity_threshold)[0]
            batch_similar_global_idx = batch_similar + start_idx
            
            # Filter out already marked duplicates
            batch_similar_global_idx = batch_similar_global_idx[~is_duplicate[batch_similar_global_idx]]
            
            if len(batch_similar_global_idx) > 0:
                similar_indices.extend(batch_similar_global_idx.tolist())
                is_duplicate[batch_similar_global_idx] = True
                
            start_idx = end_idx
            
        if len(similar_indices) > 1:
            duplicate_groups.append(similar_indices)
        else:
            unique_indices.append(i)
            
    return np.array(unique_indices), duplicate_groups

In [6]:
unique_indices, duplicate_groups = deduplicate_embeddings(embeddings, similarity_threshold=0.95)

100%|███████████████████████████████████████████████████████████████████████████████| 555379/555379 [2:00:41<00:00, 76.70it/s]


In [7]:
df = pd.read_parquet('filtered-others.parquet')
df.shape

(555379, 2)

In [8]:
len(unique_indices)

411637

In [9]:
import json

with open('deduped-others-95p.json', 'w') as fopen:
    json.dump({
        'unique_indices': unique_indices.tolist(),
        'duplicate_groups': duplicate_groups,
    }, fopen)

In [11]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="filtered-others.parquet",
    path_in_repo="malaysian-others/filtered-others.parquet",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

filtered-others.parquet:   0%|          | 0.00/50.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-Malaysian-Emilia/commit/b90f6847c326fb0ad81043e67ab2e70ac6748bb9', commit_message='Upload malaysian-others/filtered-others.parquet with huggingface_hub', commit_description='', oid='b90f6847c326fb0ad81043e67ab2e70ac6748bb9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-Malaysian-Emilia', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-Malaysian-Emilia'), pr_revision=None, pr_num=None)

In [12]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="deduped-others-95p.json",
    path_in_repo="malaysia-others/deduped-95p.json",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-Malaysian-Emilia/commit/cc44ddfd3e7cbd286fd8b396416bb3da6f5ea8b8', commit_message='Upload malaysia-others/deduped-95p.json with huggingface_hub', commit_description='', oid='cc44ddfd3e7cbd286fd8b396416bb3da6f5ea8b8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-Malaysian-Emilia', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-Malaysian-Emilia'), pr_revision=None, pr_num=None)

In [13]:
selected = []
for i in tqdm(unique_indices):
    selected.append(df.iloc[i].to_dict())
    
len(selected)

100%|██████████████████████████████████████████████████████████████████████████████| 411637/411637 [00:07<00:00, 55720.09it/s]


411637

In [14]:
selected[0]

{'audio': '/home/husein/ssd4/filtered-24k_processed/00295-25/00295-25_0.mp3',
 'transcription': 'As a disclaimer, we do own some shares mentioned in the slides and as another disclaimer, none of what we say should be taken as financial advice. This is purely educational.'}

In [15]:
len(set([os.path.split(s['audio'])[1] for s in selected])), len(selected)

(411637, 411637)

In [16]:
!mkdir dedup-others

In [17]:
import shutil
import os
# shutil.copyfile(src, dst)

def loop(rows):
    rows, _ = rows
    for r in tqdm(rows):
        f = os.path.split(r['audio'])[1]
        new_f = os.path.join('dedup-others', f)
        shutil.copyfile(r['audio'], new_f)

In [18]:
mp.multiprocessing(selected, loop, cores = 10, returned = False)

100%|█████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:34<00:00, 1181.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1011.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:34<00:00, 1189.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:35<00:00, 1163.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:36<00:00, 1136.56it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:39<00:00, 1040.06it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:41<00:00, 986.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 41163/41163 [00:41<00:

In [19]:
!du -hs dedup-others

15G	dedup-others


In [20]:
for i in range(len(selected)):
    f = os.path.split(selected[i]['audio'])[1]
    new_f = os.path.join('dedup-others', f)
    selected[i]['audio'] = new_f

In [21]:
selected[1]

{'audio': 'dedup-others/01874-5_3.mp3',
 'transcription': 'Di pihak mahkamah, kes jenayah seksual kanak-kanak diberikan perhatian khusus, menerusi pengendalian prosedur dalam mahkamah khas jenayah seksual kanak-kanak.'}

In [22]:
import IPython.display as ipd
ipd.Audio(selected[1]['audio'])

In [23]:
pd.DataFrame(selected).to_parquet('dedup-malaysian-others.parquet')

In [24]:
api.upload_file(
    path_or_fileobj="dedup-malaysian-others.parquet",
    path_in_repo="data/dedup_malaysian_others-00000-of-00001.parquet",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

dedup-malaysian-others.parquet:   0%|          | 0.00/37.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-Malaysian-Emilia/commit/e168fb8a5fe34aabe35a85fb63cb77833a716d53', commit_message='Upload data/dedup_malaysian_others-00000-of-00001.parquet with huggingface_hub', commit_description='', oid='e168fb8a5fe34aabe35a85fb63cb77833a716d53', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-Malaysian-Emilia', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-Malaysian-Emilia'), pr_revision=None, pr_num=None)