In [1]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [18]:
from scipy.spatial import KDTree
from datasketch import MinHash, MinHashLSH
from glob import glob
from tqdm import tqdm
import numpy as np
import mp
import pandas as pd
import os

In [3]:
files = glob('embedding/*.npy')
len(files)

636921

In [6]:
embeddings = []
for f in tqdm(files):
    embeddings.append(np.load(f))
embeddings = np.array(embeddings)

100%|██████████████████████████████████████████████████████████████████████████████| 636921/636921 [00:56<00:00, 11205.05it/s]


In [7]:
from sklearn.preprocessing import normalize

def deduplicate_embeddings(embeddings, similarity_threshold=0.9):
    """
    Deduplicate embeddings based on cosine similarity threshold.
    
    Args:
        embeddings: numpy array of shape [N, dim] where N is number of embeddings
        similarity_threshold: float between 0 and 1, threshold for considering embeddings as duplicates
    
    Returns:
        unique_indices: indices of unique embeddings
        duplicate_groups: list of lists containing indices of similar embeddings
    """
    # Normalize embeddings for cosine similarity
    embeddings = normalize(embeddings)
    N = embeddings.shape[0]
    
    # Track which embeddings have been marked as duplicates
    is_duplicate = np.zeros(N, dtype=bool)
    duplicate_groups = []
    unique_indices = []
    
    # Process embeddings in batches for memory efficiency
    batch_size = 300000
    
    for i in tqdm(range(N)):
        if is_duplicate[i]:
            continue
            
        # Calculate similarities for current embedding with remaining embeddings
        start_idx = i + 1
        similar_indices = [i]
        
        while start_idx < N:
            end_idx = min(start_idx + batch_size, N)
            batch_similarities = embeddings[i:i+1] @ embeddings[start_idx:end_idx].T
            
            # Find similar embeddings in batch
            batch_similar = np.where(batch_similarities[0] >= similarity_threshold)[0]
            batch_similar_global_idx = batch_similar + start_idx
            
            # Filter out already marked duplicates
            batch_similar_global_idx = batch_similar_global_idx[~is_duplicate[batch_similar_global_idx]]
            
            if len(batch_similar_global_idx) > 0:
                similar_indices.extend(batch_similar_global_idx.tolist())
                is_duplicate[batch_similar_global_idx] = True
                
            start_idx = end_idx
            
        if len(similar_indices) > 1:
            duplicate_groups.append(similar_indices)
        else:
            unique_indices.append(i)
            
    return np.array(unique_indices), duplicate_groups

In [8]:
unique_indices, duplicate_groups = deduplicate_embeddings(embeddings, similarity_threshold=0.95)

100%|██████████████████████████████████████████████████████████████████████████████| 636921/636921 [1:35:20<00:00, 111.33it/s]


In [10]:
df = pd.read_parquet('filtered-politicians.parquet')
df.shape

(636921, 2)

In [11]:
len(unique_indices)

610804

In [12]:
import json

with open('deduped-parliament-95p.json', 'w') as fopen:
    json.dump({
        'unique_indices': unique_indices.tolist(),
        'duplicate_groups': duplicate_groups,
    }, fopen)

In [13]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="filtered-politicians.parquet",
    path_in_repo="malaysia-parliament/filtered-parliament.parquet",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

filtered-politicians.parquet:   0%|          | 0.00/57.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers/commit/d74818bd5ecf8e244bc5a3bb0d67beb47dc6c0c8', commit_message='Upload malaysia-parliament/filtered-parliament.parquet with huggingface_hub', commit_description='', oid='d74818bd5ecf8e244bc5a3bb0d67beb47dc6c0c8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-malaysian-speakers'), pr_revision=None, pr_num=None)

In [14]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="deduped-parliament-95p.json",
    path_in_repo="malaysia-parliament/deduped-95p.json",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers/commit/17933af81fd89781b730aa377ae7058473cc6073', commit_message='Upload malaysia-parliament/deduped-95p.json with huggingface_hub', commit_description='', oid='17933af81fd89781b730aa377ae7058473cc6073', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-malaysian-speakers'), pr_revision=None, pr_num=None)

In [15]:
selected = []
for i in tqdm(unique_indices):
    selected.append(df.iloc[i].to_dict())
    
len(selected)

100%|██████████████████████████████████████████████████████████████████████████████| 610804/610804 [00:07<00:00, 85772.46it/s]


610804

In [16]:
selected[0]

{'audio': '/home/husein/ssd4/parlimen-24k-chunk_processed/parlimen-24k-LANGSUNG ： PERSIDANGAN DEWAN RAKYAT 18 NOV 2021 ｜ SESI KAMAR KHAS [cLmht_XCrOM]_000/parlimen-24k-LANGSUNG ： PERSIDANGAN DEWAN RAKYAT 18 NOV 2021 ｜ SESI KAMAR KHAS [cLmht_XCrOM]_000_2.mp3',
 'transcription': 'Ya Allah, kami sekalian hamba-Mu yang diperkenan menjadi ahli dewan rakyat, menyempurnakan kewajipan yang diamanahkan,'}

In [19]:
len(set([os.path.split(s['audio'])[1] for s in selected])), len(selected)

(610804, 610804)

In [20]:
!mkdir dedup-parliament

In [21]:
import shutil
import os
# shutil.copyfile(src, dst)

def loop(rows):
    rows, _ = rows
    for r in tqdm(rows):
        f = os.path.split(r['audio'])[1]
        new_f = os.path.join('dedup-parliament', f)
        shutil.copyfile(r['audio'], new_f)

In [22]:
mp.multiprocessing(selected, loop, cores = 10, returned = False)

100%|████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:06<00:00, 10063.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 14027.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:07<00:00, 7956.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:13<00:00, 4492.33it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:16<00:00, 3667.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:22<00:00, 2728.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:22<00:00, 2739.24it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 61080/61080 [00:23<00:0

In [23]:
!du -hs dedup-parliament

25G	dedup-parliament


In [25]:
for i in range(len(selected)):
    f = os.path.split(selected[i]['audio'])[1]
    new_f = os.path.join('dedup-parliament', f)
    selected[i]['audio'] = new_f

In [26]:
selected[1]

{'audio': 'dedup-parliament/parlimen-24k-LANGSUNG ： PERSIDANGAN DEWAN RAKYAT 18 NOV 2021 ｜ SESI KAMAR KHAS [cLmht_XCrOM]_000_3.mp3',
 'transcription': 'yang terutama bagi negara Malaysia, bagi rakyatnya sekalian.'}

In [27]:
import IPython.display as ipd
ipd.Audio(selected[1]['audio'])

In [28]:
pd.DataFrame(selected).to_parquet('dedup-malaysia-parliament.parquet')

In [29]:
api.upload_file(
    path_or_fileobj="dedup-malaysia-parliament.parquet",
    path_in_repo="data/dedup_malaysia_parliament-00000-of-00001.parquet",
    repo_id="malaysia-ai/dedup-malaysian-speakers",
    repo_type="dataset",
)

dedup-malaysia-parliament.parquet:   0%|          | 0.00/51.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers/commit/b8e40ddcc852bf97222500e5baf5bd0e9c19575c', commit_message='Upload data/dedup_malaysia_parliament-00000-of-00001.parquet with huggingface_hub', commit_description='', oid='b8e40ddcc852bf97222500e5baf5bd0e9c19575c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/dedup-malaysian-speakers', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/dedup-malaysian-speakers'), pr_revision=None, pr_num=None)

In [30]:
!zip -q -r -s 10000m dedup-parliament.zip dedup-parliament

In [5]:
from glob import glob

files = glob('dedup-parliament.z*')
files

['dedup-parliament.z02', 'dedup-parliament.zip', 'dedup-parliament.z01']

In [6]:
from huggingface_hub import HfApi
api = HfApi()

for f in files:
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id="malaysia-ai/dedup-malaysian-speakers",
        repo_type="dataset",
    )

dedup-parliament.z02


No files have been modified since last commit. Skipping to prevent empty commit.


dedup-parliament.zip


No files have been modified since last commit. Skipping to prevent empty commit.


dedup-parliament.z01


No files have been modified since last commit. Skipping to prevent empty commit.
