In [1]:
import pandas as pd

from xxhash import xxh64_intdigest
from datasketch import MinHash, MinHashLSH
from sentence_transformers import SentenceTransformer 
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# MinHash-LSH similarity threshold; matches the 0.8 used for the strict ("high") LSH index.
LSH_THRESHOLD = 0.8
# Number of permutations per MinHash signature (passed as num_perm to MinHash / MinHashLSH).
NUM_PERMS = 128
# Length, in characters, of each shingle extracted from a text.
SHINGLE_SIZE = 3
# Cosine-similarity cut-off above which a candidate pair is treated as a duplicate.
SBERT_THRESHOLD = 0.9

In [3]:
def shingle(string, shingle_size, max_chars=500):
    """Return the set of character shingles of ``string``.

    Parameters
    ----------
    string : str
        Text to shingle.
    shingle_size : int
        Number of characters per shingle; must be at least 1.
    max_chars : int or None, default 500
        Only the first ``max_chars`` characters are shingled. Pass ``None``
        to shingle the whole string.

    Returns
    -------
    set of str
        Every distinct substring of length ``shingle_size`` in the (truncated)
        text. A non-empty text shorter than ``shingle_size`` is returned as a
        single shingle; an empty text yields an empty set.

    Raises
    ------
    ValueError
        If ``shingle_size`` is smaller than 1.
    """
    if shingle_size < 1:
        raise ValueError("shingle_size must be a positive integer")

    if max_chars is not None:
        string = string[:max_chars]

    # Without this, every text shorter than one shingle produces an empty set,
    # so all such texts end up with the same (empty) signature downstream.
    if 0 < len(string) < shingle_size:
        return {string}

    return {string[i : i + shingle_size] for i in range(len(string) - shingle_size + 1)}

In [13]:
def _build_minhashes(data):
    """Compute one MinHash signature per item of ``data``, keyed by the item's key."""
    min_dict = dict()
    for idx, text in tqdm(data.items()):
        # NOTE(review): datasketch documents ``hashfunc`` as returning an integer
        # that fits in 32 bits, while xxh64 yields 64 bits -- confirm this is intended.
        mhash = MinHash(num_perm=NUM_PERMS, hashfunc=xxh64_intdigest)
        for shing in shingle(str(text), SHINGLE_SIZE):
            mhash.update(shing.encode("utf8"))
        min_dict[idx] = mhash
    return min_dict


def _lsh_candidate_pairs(min_dict, threshold, exclude=()):
    """Index ``min_dict`` in a MinHashLSH and list each (query, neighbour) pair once."""
    lsh = MinHashLSH(threshold=threshold, num_perm=NUM_PERMS)
    for key in tqdm(min_dict.keys()):
        lsh.insert(key, min_dict[key])

    # Insertion position gives a canonical orientation for a pair, so (a, b)
    # and (b, a) collapse to one entry even when the keys are not sortable.
    position = {key: pos for pos, key in enumerate(min_dict)}
    seen = set(exclude)  # set membership instead of scanning a growing list
    pairs = []
    for query, mhash in min_dict.items():
        for other in lsh.query(mhash):
            if other == query:
                continue
            if position[query] <= position[other]:
                pair = (query, other)
            else:
                pair = (other, query)
            if pair not in seen:
                seen.add(pair)
                pairs.append(pair)
    return pairs


def create_cand_pairs(data, high_threshold=None, low_threshold=0.55):
    """Find candidate near-duplicate pairs with MinHash LSH at two thresholds.

    Every text is shingled and hashed into a MinHash signature, the signatures
    are indexed in a strict and a loose ``MinHashLSH``, and each signature is
    then queried against both indexes.

    Each reported pair links the *queried* key with one of the keys returned
    for it. ``MinHashLSH.query`` returns the matching keys as a plain list with
    no documented ordering, so the first element of that list cannot be assumed
    to be the query itself; pairing on it would link two neighbours of the
    query that need not be similar to each other.

    Parameters
    ----------
    data : mapping-like
        Object exposing ``.items()`` that yields ``(key, text)``; values are
        converted with ``str`` before shingling.
    high_threshold : float, optional
        Threshold of the strict index. Defaults to ``LSH_THRESHOLD``.
    low_threshold : float, default 0.55
        Threshold of the loose index.

    Returns
    -------
    (list, list)
        ``(cand_list_high, cand_list_low)``; each is a list of two-element
        ``[key_a, key_b]`` lists. A pair is listed once regardless of
        orientation, and pairs already in the high list are left out of the
        low list.
    """
    if high_threshold is None:
        high_threshold = LSH_THRESHOLD

    min_dict = _build_minhashes(data)
    high_pairs = _lsh_candidate_pairs(min_dict, high_threshold)
    low_pairs = _lsh_candidate_pairs(min_dict, low_threshold, exclude=high_pairs)

    cand_list_high = [list(pair) for pair in high_pairs]
    cand_list_low = [list(pair) for pair in low_pairs]
    return cand_list_high, cand_list_low

In [173]:
def drop_near_duplicates(data, model=None):
    """Return the keys of ``data`` that should be dropped as near-duplicates.

    Candidate pairs come from ``create_cand_pairs``. Pairs from the strict LSH
    index are accepted as duplicates directly; pairs from the loose index are
    first confirmed by ``check_similarity`` (embedding cosine similarity above
    ``SBERT_THRESHOLD``). For every accepted pair the key whose text is
    shorter (ties go to the first key of the pair) is marked for removal.

    Parameters
    ----------
    data : mapping-like
        Object exposing ``.items()`` and ``data[key]`` lookup that maps a key
        to its text (the notebook passes a DataFrame column).
    model : object, optional
        Embedding model exposing ``encode``. Defaults to the notebook-level
        ``dedup_model`` global.

    Returns
    -------
    list
        Distinct keys to drop, in no particular order.
    """
    if model is None:
        model = dedup_model  # notebook-level global loaded in a later cell

    cand_list_high, cand_list_low = create_cand_pairs(data)

    # Loose-index pairs need semantic confirmation before they count.
    drop_set = set(check_similarity(cand_list_low, model, data))

    # Strict-index pairs are trusted as-is. str() mirrors the coercion applied
    # before hashing, so a non-string cell (e.g. NaN) cannot break len().
    for first, second in cand_list_high:
        if len(str(data[first])) <= len(str(data[second])):
            drop_set.add(first)
        else:
            drop_set.add(second)

    return list(drop_set)

In [36]:
def check_similarity(cand_list, model, data, threshold=None):
    """Confirm candidate pairs by embedding similarity and pick keys to drop.

    Parameters
    ----------
    cand_list : iterable
        Two-element ``[key_a, key_b]`` candidate pairs.
    model : object
        Embedding model exposing ``encode(text)``.
    data : mapping-like
        Supports ``data[key]`` lookup returning the text for a key; values are
        converted with ``str`` before encoding and measuring.
    threshold : float, optional
        A pair counts as duplicate when its cosine similarity is strictly
        greater than this value. Defaults to ``SBERT_THRESHOLD``.

    Returns
    -------
    list
        Distinct keys to drop, in no particular order. For each confirmed pair
        this is the key with the shorter text (the first key on a tie).
    """
    if threshold is None:
        threshold = SBERT_THRESHOLD

    embeddings = dict()

    def embed(key):
        # A key can appear in many pairs; encode its text only once.
        if key not in embeddings:
            embeddings[key] = model.encode(str(data[key]))
        return embeddings[key]

    drop_set = set()
    for first, second in cand_list:
        sent_score = cosine_similarity([embed(first)], [embed(second)])[0][0]
        if sent_score > threshold:
            if len(str(data[first])) <= len(str(data[second])):
                drop_set.add(first)
            else:
                drop_set.add(second)
    return list(drop_set)

In [5]:
# Load the sentence-embedding model from a local directory (path is relative to the
# notebook's working directory). drop_near_duplicates reads this `dedup_model` global.
dedup_model = SentenceTransformer("../../../fastapi_news/news_analyse/models/intfloat/multilingual-e5-small/intfloat_multilingual-e5-small/")

In [6]:
# Read the raw news CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('../../raw_data/news.csv')

In [7]:
# Work on a sample of at most 5000 rows. The draw is seeded so a Restart & Run All
# selects the same rows, and the size is capped because DataFrame.sample raises
# when asked for more rows than exist (sampling without replacement).
df = df.sample(n=min(5000, len(df)), random_state=42)

In [90]:
# Remove rows whose clean_text is an exact duplicate of an earlier row.
# NOTE(review): this cell's execution count (90) is out of sequence with the
# df.shape cell that follows (8), so the displayed shape may predate this step --
# confirm with Restart & Run All.
df = df.drop_duplicates(subset='clean_text')

In [8]:
# Display the frame's (rows, columns).
df.shape

(5000, 10)

In [92]:
# Drop rows that have no value in the 'text' column.
# NOTE(review): the near-duplicate step runs on 'clean_text', which is not
# null-checked here -- confirm whether it should be part of `subset`.
df = df.dropna(subset='text')

In [183]:
# Remove the rows whose index labels drop_near_duplicates flags, using the
# clean_text column as the text being compared.
df = df.drop(drop_near_duplicates(df['clean_text']), axis=0)

5000it [00:08, 562.70it/s]
100%|██████████| 5000/5000 [00:00<00:00, 69365.63it/s]
100%|██████████| 5000/5000 [00:00<00:00, 9696.65it/s]
