In [11]:
import time
import requests
import xml.etree.ElementTree as ET
from typing import Any, Dict, List

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def load_xml_from_url(url: str) -> pd.DataFrame:
    r = requests.get(url)

    root = ET.fromstring(r.text)
    records = root.find('records') 

    def extract(child):
        parts = []
        for sub in child:
            if len(sub) != 0:
                parts.append(extract(sub))
                continue

            if sub.tag.lower() in ('_face','_font','_size'):
                continue

            if sub.text and sub.text.strip():
                parts.append(sub.text.strip())
        text = ','.join(parts)

        return text

    rows = []
    for rec in records.findall('record'):
        row = {}
        for child in rec:
            text = ''
            tag = child.tag
            if len(child) == 0:
                text = (child.text or '').strip()
            else:
                text = extract(child)

            row[tag] = text
        rows.append(row)

    return pd.DataFrame(rows)

In [10]:
class Benchmark:
    '''
    A class for benchmarking classification models.

    Args:
        model (Any): A model object that has a `predict` method.
    '''

    def __init__(self, model: Any, average: str = 'binary'):
        self.model = model
        self.average = average

    def evaluate(self, X, y_true) -> Dict[str, float]:
        '''
        Calculates classification metrics and prediction time.

        Args:
            X: Input features.
            y_true: True labels.

        Returns:
            dict: A dictionary containing metrics and prediction time.
        '''
        start_time = time.time()
        y_pred = self.model.predict(X)
        end_time = time.time()

        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='binary', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='binary', zero_division=0),
            'f1': f1_score(y_true, y_pred, average='binary', zero_division=0),
            'prediction_time_sec': end_time - start_time,
            'number_of_samples': len(y_true),
        }

        return metrics

In [64]:
class MinHashDeduplicator:
    '''
    A class to deduplicate articles using MinHash
    '''
    def __init__(self, k=10, treshold=0.51):
        self.k = k
        self.treshold = treshold

    def predict(self, articles: pd.DataFrame) -> List[int]:
        # change each row to set without hashed values, without NaNs
        rows = []
        for _, row in articles.iterrows():
            row = row.dropna().apply(hash).to_list()
            row.sort()
            rows.append(set(row[:self.k]))

        result = [0 for _ in rows]
        # for each pair of rows calculate estimate
        for i in range(len(result)):
            for j in range(1, len(result)):
                X = set(sorted(rows[i] | rows[j])[:self.k])
                Y = X & rows[i] & rows[j]
                if len(Y) / self.k > self.treshold:
                    result[i] = 1
                    result[j] = 1

        return result



In [85]:
datasets = [
    "tafenoquine",
    "uti",
    "diabetes",
    "copper",
    "blue-light",
]

deduplicator = MinHashDeduplicator(treshold=0.80)
benchmark = Benchmark(deduplicator)

for dataset in datasets:
    url = "https://raw.githubusercontent.com/IEBH/dedupe-sweep/master/test/data/" + dataset + ".xml"
    df = load_xml_from_url(url)
    df['label'] = df["caption"].apply(lambda x: 1 if x == "Duplicate" else 0)
    print(f"{dataset}: {benchmark.evaluate(df.drop(columns=["caption", "label"]), df["label"])}")

tafenoquine: {'accuracy': 0.6480446927374302, 'precision': 0.6440677966101694, 'recall': 1.0, 'f1': 0.7835051546391752, 'prediction_time_sec': 0.07968568801879883, 'number_of_samples': 179}
uti: {'accuracy': 0.4429530201342282, 'precision': 0.40523560209424087, 'recall': 0.9675, 'f1': 0.5712177121771218, 'prediction_time_sec': 2.1759092807769775, 'number_of_samples': 1043}
diabetes: {'accuracy': 0.5066334991708126, 'precision': 0.4905379228132916, 'recall': 0.9561428986349114, 'f1': 0.6484144179633642, 'prediction_time_sec': 107.7034056186676, 'number_of_samples': 7236}
copper: {'accuracy': 0.5544554455445545, 'precision': 0.5557768924302788, 'recall': 0.9928825622775801, 'f1': 0.7126436781609196, 'prediction_time_sec': 0.5865988731384277, 'number_of_samples': 505}
blue-light: {'accuracy': 0.5573394495412844, 'precision': 0.5565217391304348, 'recall': 0.939203354297694, 'f1': 0.6989079563182528, 'prediction_time_sec': 1.6579370498657227, 'number_of_samples': 872}
