# Efficient Duplicate Detection with Embeddings and FAISS

In [1]:
import os
import sys
from typing import List

# Append the parent of the current working directory to the import path.
# NOTE(review): presumably this is what lets `from utils import *` below
# resolve when the notebook is run from a sub-directory — confirm.
ROOT_DIR = os.path.dirname(os.getcwd())
sys.path.append(ROOT_DIR)

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the evaluation dataset; later cells read its 'abstract' and 'label'
# columns. NOTE(review): prepare_dataset is not defined in this notebook —
# presumably it comes from the `utils` star-import; confirm.
df = prepare_dataset()

Loading datasets: 100%|██████████| 5/5 [00:04<00:00,  1.24it/s]


In [3]:
class EmbeddingDeduplicator:
    '''
    Flag near-duplicate texts by comparing sentence embeddings with FAISS.

    Every text is encoded into an L2-normalised embedding, so the inner
    product computed by the flat FAISS index is the cosine similarity. A text
    is flagged when at least one *other* text among its ``top_k`` nearest
    neighbours has a similarity strictly above ``threshold``; that neighbour
    is flagged as well.
    '''
    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', dimension: int = 384, top_k: int = 5, threshold: float = 0.85):
        '''
        Args:
            model_name: SentenceTransformer model to load (cached in '.cache').
            dimension: Expected embedding size. Stored for backward
                compatibility; ``predict`` sizes the index from the encoded
                vectors so that other models work without extra configuration.
            top_k: Number of nearest neighbours retrieved per text. A text is
                normally its own nearest neighbour, so values below 2 leave
                no room for a real match.
            threshold: Similarity strictly above which two texts count as
                duplicates.
        '''
        self.model = SentenceTransformer(model_name, cache_folder='.cache')
        self.dimension = dimension
        self.top_k = top_k
        self.threshold = threshold

    @staticmethod
    def _flag_duplicates(similarities: np.ndarray, neighbors: np.ndarray, threshold: float) -> np.ndarray:
        '''Turn per-row search results into a 0/1 duplicate flag per row.'''
        n_rows = len(neighbors)
        flags = np.zeros(n_rows, dtype=int)
        own_index = np.arange(n_rows)[:, None]

        # A hit is a neighbour above the threshold that is a real row
        # (FAISS pads missing results with -1) and is not the row itself.
        # Comparing indices, rather than dropping column 0, stays correct
        # when exact duplicates tie and the self-match is not ranked first.
        hits = (similarities > threshold) & (neighbors >= 0) & (neighbors != own_index)

        flags[hits.any(axis=1)] = 1   # the querying text
        flags[neighbors[hits]] = 1    # the matched neighbour
        return flags

    def predict(self, texts: List[str]) -> np.ndarray:
        '''
        Label each text as duplicate (1) or unique (0).

        Args:
            texts: Texts to compare against each other.

        Returns:
            Integer array with one entry per input text, in input order.
        '''
        if len(texts) == 0:
            # Nothing to encode or index; skip the model and FAISS entirely.
            return np.zeros(0, dtype=int)

        embeddings = self.model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Size the index from the actual vectors so a model whose output
        # size differs from `self.dimension` does not fail on add().
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        similarities, neighbors = index.search(embeddings, self.top_k)
        return self._flag_duplicates(similarities, neighbors, self.threshold)

In [4]:
# Score the embedding-based deduplicator on the abstracts against the
# reference labels, using a strict similarity cut-off and a small neighbourhood.
abstracts = df['abstract'].to_list()
labels = df['label']

deduplicator = EmbeddingDeduplicator(top_k=3, threshold=0.95)
benchmark = Benchmark(deduplicator)
# Left as the final bare expression so the notebook displays the metrics.
benchmark.evaluate(abstracts, labels)

{'accuracy': 0.9459195161035377,
 'precision': 0.9690495374916931,
 'recall': 0.921263248972552,
 'f1': 0.9445523848937178,
 'prediction_time_sec': 21.910506010055542,
 'samples': 9347,
 'duplicates': 4623}