In [1]:
!pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading faiss_gpu_cu12-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.11.0


In [2]:
from typing import List, Optional

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

2025-05-22 19:44:18.714620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747943058.887765      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747943058.941416      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# dataset.py
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

# Names of the XML files (without the `.xml` suffix) to download from BASE_URL.
DATASETS = ["tafenoquine", "uti", "diabetes", "copper", "blue-light"]

# Directory URL the dataset file names are appended to.
BASE_URL = "https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/"

def _flatten_element_text(element: ET.Element) -> str:
    '''
    Join the stripped text of the leaf descendants of ``element`` with commas.

    Children that themselves have children are flattened recursively and their
    joined text is appended as a single part. Leaf children tagged ``_face``,
    ``_font`` or ``_size`` (case-insensitive) and leaves without
    non-whitespace text are skipped.
    '''
    parts = []
    for sub in element:
        if len(sub) != 0:
            parts.append(_flatten_element_text(sub))
            continue

        # Formatting-only leaves carry no record content.
        if sub.tag.lower() in ('_face', '_font', '_size'):
            continue

        value = (sub.text or '').strip()
        if value:
            parts.append(value)

    return ','.join(parts)


def load_xml_from_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    '''
    Download an XML document and convert its records to a pandas DataFrame.

    The document is expected to contain a ``records`` element directly under
    the root, holding ``record`` children. Every direct child of a record
    becomes one column named after the child's tag: leaf children contribute
    their stripped text, nested children contribute the comma-joined text of
    their leaves. If a tag occurs more than once in a record, the last
    occurrence wins.

    Args:
        url: Location of the XML document.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        pd.DataFrame: One row per ``record`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
        ValueError: If the document has no ``records`` element under the root.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of feeding an error page to the XML parser.
    response.raise_for_status()

    # Parse the raw bytes so the encoding declared in the XML prolog is used
    # rather than the charset guessed from the HTTP headers.
    root = ET.fromstring(response.content)
    records = root.find('records')
    if records is None:
        raise ValueError(f'No <records> element found in XML document at {url}')

    rows = []
    for rec in records.findall('record'):
        row = {}
        for child in rec:
            if len(child) == 0:
                row[child.tag] = (child.text or '').strip()
            else:
                row[child.tag] = _flatten_element_text(child)
        rows.append(row)

    return pd.DataFrame(rows)

def prepare_dataset() -> pd.DataFrame:
    '''
    Download every dataset listed in DATASETS and merge them into one frame.

    A binary ``label`` column is derived from ``caption`` (1 when the caption
    is exactly 'Duplicate', 0 otherwise) and rows with a missing ``abstract``
    are removed. The surviving rows keep the index assigned by the
    concatenation.
    '''
    frames = []
    for name in tqdm(DATASETS, desc="Loading datasets"):
        frames.append(load_xml_from_url(f'{BASE_URL}{name}.xml'))

    combined = pd.concat(frames, ignore_index=True)
    combined['label'] = [1 if caption == 'Duplicate' else 0 for caption in combined['caption']]

    return combined.dropna(subset=['abstract'])


In [4]:
# benchmark.py
import time
from typing import Any, Dict, List

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class Benchmark:
    '''
    Scores the predictions of a classifier-like object.

    Args:
        model (Any): A model object that has a `predict` method returning one
            label per input sample.
    '''

    def __init__(self, model: Any):
        self.model = model

    def evaluate(self, X: List[str], y_true: np.ndarray, verbose: bool = False) -> Dict[str, float]:
        '''
        Runs the model on ``X`` and computes classification metrics and timing.

        All scores are computed with 'balanced' sample weights derived from
        ``y_true``, so each class contributes equally regardless of how many
        samples it has.

        Args:
            X: Input features passed unchanged to ``model.predict``.
            y_true: True binary labels, one per element of ``X``.
            verbose: If True, prints the summary of metrics.

        Returns:
            dict: ``accuracy``, ``precision``, ``recall``, ``f1``,
            ``prediction_time_sec`` (time spent inside ``model.predict``),
            ``samples`` (number of labels) and ``duplicates`` (sum of the
            labels, i.e. the number of positives for 0/1 labels).
        '''
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the wall clock and can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        y_pred = self.model.predict(X)
        elapsed = time.perf_counter() - start_time

        sample_weights = compute_sample_weight(class_weight='balanced', y=y_true)
        binary_kwargs = dict(average='binary', zero_division=0, sample_weight=sample_weights)

        metrics = {
            'accuracy': accuracy_score(y_true, y_pred, sample_weight=sample_weights),
            'precision': precision_score(y_true, y_pred, **binary_kwargs),
            'recall': recall_score(y_true, y_pred, **binary_kwargs),
            'f1': f1_score(y_true, y_pred, **binary_kwargs),
            'prediction_time_sec': elapsed,
            'samples': len(y_true),
            # Vectorised sum, converted to a plain int for lists and arrays alike.
            'duplicates': int(np.sum(y_true)),
        }

        if verbose:
            print('Summary:')
            print(f"{'Metric':<20}{'Value':>15}")
            print('-' * 35)
            for metric, value in metrics.items():
                print(f'{metric.capitalize():<20}{value:>15.5f}')

        return metrics


In [5]:
# Download all configured datasets and build the labelled DataFrame
# (this performs one HTTP request per dataset).
df = prepare_dataset()

Loading datasets: 100%|██████████| 5/5 [00:05<00:00,  1.10s/it]


In [6]:
# Plain Python lists: the abstracts are the model input, the labels the ground truth.
texts, labels = (df[column].tolist() for column in ('abstract', 'label'))

In [7]:
class EmbeddingDeduplicatorCPU:
    '''
    Flags near-duplicate texts using sentence embeddings and a FAISS
    inner-product index.

    Texts are encoded into L2-normalised embeddings, so the inner product
    returned by the index is the cosine similarity. A text is flagged when at
    least one *other* text among its ``top_k`` nearest neighbours has a
    similarity strictly above ``threshold``; that neighbour is flagged too.

    Args:
        model_name: Name of the SentenceTransformer model to load.
        dimension: Embedding width used to build the index; it must match the
            width of the vectors the model produces.
        top_k: Number of nearest neighbours retrieved per text. The text
            itself usually takes one of these slots.
        threshold: Similarity a neighbour must exceed to count as a duplicate.
        bacth_size: Encoding batch size. The misspelled name is kept because
            existing callers pass it by keyword.
        batch_size: Correctly spelled alias; when given it takes precedence
            over ``bacth_size``.
    '''
    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', dimension: int = 384, top_k: int = 3, threshold: float = 0.85, bacth_size: int = 32, batch_size: Optional[int] = None):
        self.model = SentenceTransformer(model_name, cache_folder='.cache')
        self.dimension = dimension
        self.top_k = top_k
        self.threshold = threshold
        self.batch_size = bacth_size if batch_size is None else batch_size

    def predict(self, texts: List[str]) -> np.ndarray:
        '''
        Returns an integer array with 1 for texts that have a near-duplicate
        in ``texts`` and 0 for all others.

        Raises:
            ValueError: If the model's embeddings do not have ``dimension``
                columns.
        '''
        # Nothing to encode or index; avoids handing an empty batch to FAISS.
        if len(texts) == 0:
            return np.zeros(0, dtype=int)

        embeddings = self.model.encode(texts, show_progress_bar=False, normalize_embeddings=True, batch_size=self.batch_size)
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        if embeddings.ndim != 2 or embeddings.shape[1] != self.dimension:
            raise ValueError(
                f'Expected embeddings of shape (n, {self.dimension}), got {embeddings.shape}; '
                'check that `dimension` matches the model.'
            )

        index = faiss.IndexFlatIP(self.dimension)
        index.add(embeddings)

        similarities, neighbors = index.search(embeddings, self.top_k)
        return self._flag_duplicates(similarities, neighbors, self.threshold)

    @staticmethod
    def _flag_duplicates(similarities: np.ndarray, neighbors: np.ndarray, threshold: float) -> np.ndarray:
        '''Turn per-row search results into a 0/1 duplicate flag per row.'''
        similarities = np.asarray(similarities)
        neighbors = np.asarray(neighbors)
        count = neighbors.shape[0]
        own_index = np.arange(count)[:, None]

        # Exclude the self-match by index, not by rank: with exact duplicates
        # the query is tied with its copies and need not come back first.
        # Negative ids are the padding FAISS returns when fewer than k
        # neighbours exist.
        hits = (similarities > threshold) & (neighbors != own_index) & (neighbors >= 0)

        flags = np.zeros(count, dtype=int)
        flags[hits.any(axis=1)] = 1
        flags[neighbors[hits]] = 1
        return flags

In [8]:
# Benchmark the CPU deduplicator with the paraphrase-MiniLM-L3-v2 model and a
# 0.96 similarity threshold.
deduplicator = EmbeddingDeduplicatorCPU(
    'sentence-transformers/paraphrase-MiniLM-L3-v2',
    dimension=384,
    top_k=3,
    threshold=0.96,
    bacth_size=256,
)
benchmark = Benchmark(model=deduplicator)
metrics = benchmark.evaluate(X=texts, y_true=labels, verbose=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Summary:
Metric                        Value
-----------------------------------
Accuracy                    0.92749
Precision                   0.98256
Recall                      0.87043
F1                          0.92310
Prediction_time_sec        10.64631
Samples                  9347.00000
Duplicates               4623.00000


In [11]:
class EmbeddingDeduplicatorGPU:
    '''
    Flags near-duplicate texts using sentence embeddings and a FAISS
    inner-product index that is copied to a GPU.

    Texts are encoded into L2-normalised embeddings, so the inner product
    returned by the index is the cosine similarity. A text is flagged when at
    least one *other* text among its ``top_k`` nearest neighbours has a
    similarity strictly above ``threshold``; that neighbour is flagged too.
    Only the index is moved to the GPU here; where the encoder runs is decided
    by SentenceTransformer.

    Args:
        model_name: Name of the SentenceTransformer model to load.
        dimension: Embedding width used to build the index; it must match the
            width of the vectors the model produces.
        top_k: Number of nearest neighbours retrieved per text. The text
            itself usually takes one of these slots.
        threshold: Similarity a neighbour must exceed to count as a duplicate.
        bacth_size: Encoding batch size. The misspelled name is kept because
            existing callers pass it by keyword.
        batch_size: Correctly spelled alias; when given it takes precedence
            over ``bacth_size``.
        device: Index of the GPU that hosts the FAISS index.
    '''
    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', dimension: int = 384, top_k: int = 3, threshold: float = 0.85, bacth_size: int = 32, batch_size: Optional[int] = None, device: int = 0):
        self.model = SentenceTransformer(model_name, cache_folder='.cache')
        self.dimension = dimension
        self.top_k = top_k
        self.threshold = threshold
        self.batch_size = bacth_size if batch_size is None else batch_size
        self.device = device
        # Created once and reused by every predict() call.
        self.res = faiss.StandardGpuResources()

    def predict(self, texts: List[str]) -> np.ndarray:
        '''
        Returns an integer array with 1 for texts that have a near-duplicate
        in ``texts`` and 0 for all others.

        Raises:
            ValueError: If the model's embeddings do not have ``dimension``
                columns.
        '''
        # Nothing to encode or index; avoids handing an empty batch to FAISS.
        if len(texts) == 0:
            return np.zeros(0, dtype=int)

        embeddings = self.model.encode(texts, show_progress_bar=False, normalize_embeddings=True, batch_size=self.batch_size)
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        if embeddings.ndim != 2 or embeddings.shape[1] != self.dimension:
            raise ValueError(
                f'Expected embeddings of shape (n, {self.dimension}), got {embeddings.shape}; '
                'check that `dimension` matches the model.'
            )

        index = faiss.IndexFlatIP(self.dimension)
        index_gpu = faiss.index_cpu_to_gpu(self.res, self.device, index)
        index_gpu.add(embeddings)

        similarities, neighbors = index_gpu.search(embeddings, self.top_k)
        return self._flag_duplicates(similarities, neighbors, self.threshold)

    @staticmethod
    def _flag_duplicates(similarities: np.ndarray, neighbors: np.ndarray, threshold: float) -> np.ndarray:
        '''Turn per-row search results into a 0/1 duplicate flag per row.'''
        similarities = np.asarray(similarities)
        neighbors = np.asarray(neighbors)
        count = neighbors.shape[0]
        own_index = np.arange(count)[:, None]

        # Exclude the self-match by index, not by rank: with exact duplicates
        # the query is tied with its copies and need not come back first.
        # Negative ids are the padding FAISS returns when fewer than k
        # neighbours exist.
        hits = (similarities > threshold) & (neighbors != own_index) & (neighbors >= 0)

        flags = np.zeros(count, dtype=int)
        flags[hits.any(axis=1)] = 1
        flags[neighbors[hits]] = 1
        return flags

In [12]:
# Benchmark the GPU-index deduplicator with the same model and settings as the
# CPU run so the two summaries are directly comparable.
deduplicator = EmbeddingDeduplicatorGPU(
    'sentence-transformers/paraphrase-MiniLM-L3-v2',
    dimension=384,
    top_k=3,
    threshold=0.96,
    bacth_size=256,
)
benchmark = Benchmark(model=deduplicator)
metrics = benchmark.evaluate(X=texts, y_true=labels, verbose=True)

Summary:
Metric                        Value
-----------------------------------
Accuracy                    0.92749
Precision                   0.98256
Recall                      0.87043
F1                          0.92310
Prediction_time_sec        10.31810
Samples                  9347.00000
Duplicates               4623.00000
