In [None]:
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format
from datasketch import MinHash, MinHashLSH, MinHashLSHForest, MinHashLSHEnsemble
from nltk import ngrams
from tqdm import tqdm
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Load the training CSV; later cells read its 'text' column (input to MinHash)
# and its 'deduplicate' column (used as the ground-truth label when scoring).
df = pd.read_csv('Dataset_duplicate_train.csv')
df.head()

In [None]:
%%time
# that accepts MinHash objects with 128 permutations functions
data = df['text'].values
num_perm = 128
lsh = MinHashLSH(threshold=0.8, num_perm=num_perm)
 
# Create MinHash objects
minhashes = {}
error = []
for c, i in enumerate(tqdm(data)):
    try:
        if c%5000 == 0:
            print(c)
        minhash = MinHash(num_perm=num_perm)
        for d in ngrams(i, 16):
            minhash.update("".join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash
    except:
        error.append(c)
        pass 

    duplicate = []
for i in range(len(minhashes.keys())):
    try:
        result = lsh.query(minhashes[i])
        if len(result) > 1:
            result.sort()
            duplicate.append(result)
    except:
        pass
duplicate.sort()
duplicate = list(duplicate for duplicate, _ in itertools.groupby(duplicate))
delete = []
for value in duplicate:
    delete.append(value[1:])
delete = list(itertools.chain(*delete))

In [None]:
# Flag every row whose index value appears in `delete` as a predicted duplicate.
# NOTE(review): `delete` holds enumerate positions from the indexing cell, so
# this assumes `df` still has the default 0..n-1 index -- confirm.
df['predict'] = np.where(df.index.isin(delete), 1, 0)
labels = ['deduplicate', 'no-deduplicate']  # not referenced by the visible cells; kept as-is
print(roc_auc_score(df['deduplicate'], df['predict']))

# confusion_matrix puts true labels on rows and predictions on columns.
# fmt='d' prints the integer counts in full; the default annotation format
# would abbreviate large counts in scientific notation.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(confusion_matrix(df['deduplicate'], df['predict']), annot=True, fmt='d', ax=ax)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (train)');

## Evaluation on the test set

In [None]:
# Load the test CSV and score the deduplication model on it.
# The frame that was just read is passed to the model; previously an undefined
# name was passed, which only worked with leftover kernel state.
# NOTE(review): `deduplicate_model` is not defined in the visible cells --
# confirm it is defined (or imported) before this cell runs.
df = pd.read_csv('Dataset_duplicate_test.csv')
df_result = deduplicate_model(df, num_perm=128, ngram=16, threshold=0.6)
roc_auc_score(df_result['deduplicate'], df_result['predict'])

In [None]:
# Confusion matrix of the test-set predictions (rows: true label, columns:
# predicted label). fmt='d' prints the integer counts in full; the default
# annotation format would abbreviate large counts in scientific notation.
labels = ['deduplicate', 'no-deduplicate']  # not referenced by the visible cells; kept as-is
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(confusion_matrix(df_result['deduplicate'], df_result['predict']), annot=True, fmt='d', ax=ax)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (test)');

In [None]:
# Per-class precision/recall/F1 for the test-set predictions. Uses `df_result`
# (the model output scored by the other test-evaluation cells), not `df`.
print(classification_report(df_result['deduplicate'], df_result['predict']))