### Реализация блока из SOTA, где используется косинусное сходство

Вместо готовой функции cosine_similarity из sklearn можно реализовать собственную аналогичную функцию с использованием numpy для вычисления матрицы попарного косинусного сходства.

In [1]:
import numpy as np

def cosine_similarity_custom(cluster_embeddings, eps=10e-10):
    """Compute the pairwise cosine similarity between the rows of a 2-D array.

    Each row is scaled to unit L2 norm and the similarity matrix is the
    product of the normalized matrix with its own transpose.

    Args:
        cluster_embeddings: Array-like of shape (n_samples, n_features).
        eps: Positive value substituted for the L2 norm of all-zero rows so
            that the division is always defined. Note that the default
            ``10e-10`` equals ``1e-9``.

    Returns:
        Array of shape (n_samples, n_samples) whose entry ``[i, j]`` is the
        cosine similarity of rows ``i`` and ``j``, clipped to ``[-1, 1]``.
        An all-zero row has similarity 0 with every row, itself included.
    """
    vectors = np.asarray(cluster_embeddings)
    L2_norm = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Only exactly-zero norms are replaced; the numerator of such rows is
    # zero as well, so they normalize to zero vectors instead of NaN.
    safe_norm = np.where(L2_norm == 0, eps, L2_norm)
    normalized = vectors / safe_norm
    similarity_matrix = normalized @ normalized.T
    # Floating-point rounding can push values slightly outside [-1, 1].
    return np.clip(similarity_matrix, -1.0, 1.0)

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

def find_duplicates(embeddings, cluster_labels, threshold=0.85, use_custom_function=False):
    """Find pairs of highly similar items inside each cluster.

    For every cluster label except ``-1`` (presumably the noise label of a
    density-based clusterer; confirm against the caller), the pairwise cosine
    similarity of the cluster's embeddings is computed and every pair whose
    similarity is strictly greater than ``threshold`` is reported.

    Args:
        embeddings: Indexable collection of row vectors, one per item.
        cluster_labels: One cluster label per item, aligned with
            ``embeddings``.
        threshold: Pairs with similarity strictly above this value are
            treated as duplicates.
        use_custom_function: If true, use ``cosine_similarity_custom``;
            otherwise use ``sklearn.metrics.pairwise.cosine_similarity``.

    Returns:
        List of ``(index_a, index_b, similarity)`` tuples, where the indices
        refer to positions in ``embeddings`` with ``index_a < index_b``,
        sorted by descending similarity.
    """
    # With a plain list, `cluster_labels == cluster_id` is a single bool and
    # no cluster member would ever be selected, so coerce to an array first.
    cluster_labels = np.asarray(cluster_labels)
    # Lists and tuples do not support indexing by an index array.
    if isinstance(embeddings, (list, tuple)):
        embeddings = np.asarray(embeddings)

    duplicates = []
    # np.unique yields the labels in sorted order, so ties in similarity are
    # reported in a deterministic order.
    for cluster_id in np.unique(cluster_labels):
        if cluster_id == -1:
            continue

        cluster_indices = np.where(cluster_labels == cluster_id)[0]
        if len(cluster_indices) < 2:
            # A single item cannot form a pair.
            continue

        cluster_embeddings = embeddings[cluster_indices]

        if use_custom_function:
            sim_matrix = cosine_similarity_custom(cluster_embeddings)
        else:
            sim_matrix = cosine_similarity(cluster_embeddings)

        # Strict upper triangle: each unordered pair once, in the same
        # row-major order as a nested `for i` / `for j > i` loop.
        rows, cols = np.triu_indices(len(cluster_indices), k=1)
        pair_similarities = sim_matrix[rows, cols]
        above = pair_similarities > threshold
        duplicates.extend(zip(
            cluster_indices[rows[above]],
            cluster_indices[cols[above]],
            pair_similarities[above],
        ))

    return sorted(duplicates, key=lambda x: -x[2])