# Efficient Duplicate Detection with Embeddings and FAISS

In [None]:
import time
import requests
import xml.etree.ElementTree as ET
from typing import List, Any, Dict

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
def load_xml_from_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    '''
    Download an XML reference library and flatten its records into a DataFrame.

    The document must hold a `records` element directly under the root, with
    `record` elements inside it. Every direct child of a record becomes a
    column named after the child's tag: a leaf child contributes its stripped
    text, a nested child contributes the comma-joined text of its descendants.

    Args:
        url: Address of the XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pd.DataFrame: One row per record. A field that is absent from a
        record is left as a missing value in that row.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the root has no `records` child.
    '''
    # Tags skipped while flattening nested fields.
    skipped_tags = ('_face', '_font', '_size')

    def flatten(element) -> str:
        '''Join the non-empty text found below `element` with commas.'''
        parts = []
        for sub in element:
            if len(sub) != 0:
                # Nested element: recurse, but skip empty results so the
                # joined text carries no stray commas.
                nested = flatten(sub)
                if nested:
                    parts.append(nested)
                continue

            if sub.tag.lower() in skipped_tags:
                continue

            leaf_text = (sub.text or '').strip()
            if leaf_text:
                parts.append(leaf_text)
        return ','.join(parts)

    r = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as XML.
    r.raise_for_status()

    # Parse the raw bytes so the encoding declared in the XML prolog is used
    # rather than the encoding guessed from the HTTP headers.
    root = ET.fromstring(r.content)
    records = root.find('records')
    if records is None:
        raise ValueError(f"No 'records' element found in XML loaded from {url}")

    rows = [
        {
            child.tag: flatten(child) if len(child) else (child.text or '').strip()
            for child in rec
        }
        for rec in records.findall('record')
    ]

    return pd.DataFrame(rows)

In [19]:
class Benchmark:
    '''
    A class for benchmarking classification models.

    Args:
        model (Any): A model object that has a `predict` method.
        average (str): Averaging mode passed to the precision, recall and F1
            scorers. Defaults to 'binary'.
    '''

    def __init__(self, model: Any, average: str = 'binary'):
        self.model = model
        self.average = average

    def evaluate(self, X, y_true) -> Dict[str, float]:
        '''
        Calculates classification metrics and prediction time.

        Args:
            X: Input features, handed unchanged to `model.predict`.
            y_true: True labels.

        Returns:
            dict: A dictionary containing accuracy, precision, recall, F1,
            the time spent inside `model.predict` in seconds, and the number
            of samples.
        '''
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        y_pred = self.model.predict(X)
        elapsed = time.perf_counter() - start_time

        # Use the averaging mode chosen at construction time for every
        # scorer that accepts one.
        scorer_options = {'average': self.average, 'zero_division': 0}

        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, **scorer_options),
            'recall': recall_score(y_true, y_pred, **scorer_options),
            'f1': f1_score(y_true, y_pred, **scorer_options),
            'prediction_time_sec': elapsed,
            'number_of_samples': len(y_true),
        }

        return metrics

In [None]:
from concurrent.futures import ThreadPoolExecutor

class EmbeddingDeduplicator:
    '''
    A class to flag near-duplicate texts using sentence embeddings and FAISS.

    Texts are encoded into normalized embeddings and stored in an
    inner-product index. A text is flagged when one of its nearest
    neighbours, other than itself, scores above the threshold.

    Args:
        model_name: SentenceTransformer model used to embed the texts.
        dimension: Vector size of the FAISS index. It has to match the size
            of the embeddings produced by `model_name`.
        top_k: Number of neighbours retrieved per text. The text itself is
            normally among them, so `top_k - 1` other texts are compared.
        threshold: Similarity a neighbour must exceed to count as a duplicate.
    '''
    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', dimension: int = 384, top_k: int = 5, threshold: float = 0.85):
        self.model = SentenceTransformer(model_name, cache_folder='.cache')
        self.index = faiss.IndexFlatIP(dimension)
        self.top_k = top_k
        self.threshold = threshold

    def predict(self, texts: List[str]) -> np.ndarray:
        '''
        Label each text as duplicate (1) or unique (0).

        Args:
            texts: Texts to compare against each other.

        Returns:
            np.ndarray: Integer array with one entry per text, in input order.
            Both members of every pair whose similarity exceeds the threshold
            are set to 1.
        '''
        n_texts = len(texts)
        labels = np.zeros(n_texts, dtype=int)
        if n_texts == 0:
            return labels

        embeddings = self.model.encode(texts, show_progress_bar=False, normalize_embeddings=True)

        # Start from an empty index: vectors left over from an earlier call
        # would be returned as neighbours whose ids do not belong to `texts`.
        self.index.reset()
        self.index.add(embeddings)

        similarities, neighbors = self.index.search(embeddings, self.top_k)

        # Ignore padding ids (-1, returned when fewer than top_k vectors
        # exist) and each text's match with itself, wherever it is ranked.
        own_id = np.arange(n_texts)[:, None]
        is_duplicate_pair = (
            (neighbors >= 0)
            & (neighbors != own_id)
            & (similarities > self.threshold)
        )

        rows, cols = np.nonzero(is_duplicate_pair)
        labels[rows] = 1
        labels[neighbors[rows, cols]] = 1
        return labels

In [21]:
url = 'https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/tafenoquine.xml'

# Records captioned 'Duplicate' form the positive class (label 1).
df_tafenoquine = load_xml_from_url(url)
df_tafenoquine['label'] = df_tafenoquine['caption'].eq('Duplicate').astype('int64')
# Drop records without an abstract, as the diabetes, copper and blue-light
# cells do, so only real text reaches the embedding model.
df_tafenoquine = df_tafenoquine.dropna(subset=['abstract'])

In [69]:
# Score the embedding-based deduplicator on the tafenoquine abstracts.
deduplicator = EmbeddingDeduplicator(top_k=3, threshold=0.95)
benchmark = Benchmark(model=deduplicator)
benchmark.evaluate(
    X=df_tafenoquine['abstract'].tolist(),
    y_true=df_tafenoquine['label'],
)

{'accuracy': 0.9553072625698324,
 'precision': 0.9344262295081968,
 'recall': 1.0,
 'f1': 0.9661016949152542,
 'prediction_time_sec': 0.5236790180206299,
 'number_of_samples': 179}

In [23]:
url = 'https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/uti.xml'

# Records captioned 'Duplicate' form the positive class (label 1).
df_uti = load_xml_from_url(url)
df_uti['label'] = df_uti['caption'].eq('Duplicate').astype('int64')
# Drop records without an abstract, as the diabetes, copper and blue-light
# cells do, so only real text reaches the embedding model.
df_uti = df_uti.dropna(subset=['abstract'])

In [70]:
# Score the embedding-based deduplicator on the UTI abstracts.
deduplicator = EmbeddingDeduplicator(top_k=3, threshold=0.95)
benchmark = Benchmark(model=deduplicator)
benchmark.evaluate(
    X=df_uti['abstract'].tolist(),
    y_true=df_uti['label'],
)

{'accuracy': 0.9463087248322147,
 'precision': 0.9257425742574258,
 'recall': 0.935,
 'f1': 0.9303482587064676,
 'prediction_time_sec': 2.4804320335388184,
 'number_of_samples': 1043}

In [25]:
url = 'https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/diabetes.xml'

# Records captioned 'Duplicate' form the positive class (label 1);
# records without an abstract are removed afterwards.
df_diabetes = load_xml_from_url(url)
df_diabetes['label'] = df_diabetes['caption'].eq('Duplicate').astype('int64')
df_diabetes = df_diabetes[df_diabetes['abstract'].notna()]

In [71]:
# Score the embedding-based deduplicator on the diabetes abstracts.
deduplicator = EmbeddingDeduplicator(top_k=3, threshold=0.95)
benchmark = Benchmark(model=deduplicator)
benchmark.evaluate(
    X=df_diabetes['abstract'].tolist(),
    y_true=df_diabetes['label'],
)

{'accuracy': 0.9418383749817332,
 'precision': 0.9681689253072802,
 'recall': 0.9118432769367765,
 'f1': 0.9391623356771629,
 'prediction_time_sec': 16.29113507270813,
 'number_of_samples': 6843}

In [29]:
url = 'https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/copper.xml'

# Records captioned 'Duplicate' form the positive class (label 1);
# records without an abstract are removed afterwards.
df_copper = load_xml_from_url(url)
df_copper['label'] = df_copper['caption'].eq('Duplicate').astype('int64')
df_copper = df_copper[df_copper['abstract'].notna()]

In [55]:
# Score the embedding-based deduplicator on the copper abstracts.
deduplicator = EmbeddingDeduplicator(top_k=3, threshold=0.95)
benchmark = Benchmark(model=deduplicator)
benchmark.evaluate(
    X=df_copper['abstract'].tolist(),
    y_true=df_copper['label'],
)

{'accuracy': 0.9859437751004017,
 'precision': 0.9755244755244755,
 'recall': 1.0,
 'f1': 0.9876106194690265,
 'prediction_time_sec': 1.1990149021148682,
 'number_of_samples': 498}

In [31]:
url = 'https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/blue-light.xml'

# Records captioned 'Duplicate' form the positive class (label 1);
# records without an abstract are removed afterwards.
df_blue_light = load_xml_from_url(url)
df_blue_light['label'] = df_blue_light['caption'].eq('Duplicate').astype('int64')
df_blue_light = df_blue_light[df_blue_light['abstract'].notna()]

In [56]:
# Score the embedding-based deduplicator on the blue-light abstracts.
deduplicator = EmbeddingDeduplicator(top_k=3, threshold=0.95)
benchmark = Benchmark(model=deduplicator)
benchmark.evaluate(
    X=df_blue_light['abstract'].tolist(),
    y_true=df_blue_light['label'],
)

{'accuracy': 0.9392059553349876,
 'precision': 0.9814385150812065,
 'recall': 0.9116379310344828,
 'f1': 0.9452513966480447,
 'prediction_time_sec': 1.8415331840515137,
 'number_of_samples': 806}