In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def calculate_similarity_score(sent1, sent2, embedding1, embedding2, alpha=0.7, beta=0.2, gamma=0.1):
    """Blend embedding, length and vocabulary similarity into one score.

    The result is
    ``alpha * cosine_sim + beta * (1 - length_diff) + gamma * word_overlap``.

    Args:
        sent1: First sentence (string).
        sent2: Second sentence (string).
        embedding1: 1-D embedding vector of ``sent1``.
        embedding2: 1-D embedding vector of ``sent2``.
        alpha: Weight of the cosine similarity between the two embeddings.
        beta: Weight of the character-length agreement term.
        gamma: Weight of the word-overlap (Jaccard) term.

    Returns:
        The weighted sum of the three terms.
    """
    cosine_sim = cosine_similarity([embedding1], [embedding2])[0][0]

    # Relative difference in character length. Two empty strings have no
    # length difference; guard the division instead of raising.
    total_length = len(sent1) + len(sent2)
    length_diff = abs(len(sent1) - len(sent2)) / total_length if total_length else 0.0

    try:
        # binary=True makes each row a 0/1 presence vector, so the dot product
        # is |intersection| and the denominator below is |union| (Jaccard).
        # With raw counts a repeated word could push the denominator to zero
        # or below and the "overlap" outside the [0, 1] range.
        vectors = CountVectorizer(binary=True).fit_transform([sent1, sent2])
    except ValueError:
        # CountVectorizer raises when neither sentence yields a token
        # (empty vocabulary); treat that as no word overlap.
        word_overlap = 0.0
    else:
        shared_words = vectors[0].dot(vectors[1].T).toarray()[0][0]
        # A non-empty vocabulary guarantees at least one word in the union.
        all_words = vectors[0].sum() + vectors[1].sum() - shared_words
        word_overlap = shared_words / all_words

    similarity_score = alpha * cosine_sim + beta * (1 - length_diff) + gamma * word_overlap

    return similarity_score

# Load the SentenceTransformer checkpoint stored at this local path.
model = SentenceTransformer('./sbert_from_mlm_bert_80')

# Two sample sentences used for a quick check of the scoring function.
sent1 = '– Тхьэ соIуэ, си пыIэкур къивудакIэ!'
sent2 = '– Тхьэ соIуэ, ткIуэпс сIумыхуакIэ!'

# Left as a bare expression so the notebook displays the returned score.
calculate_similarity_score(
    sent1=sent1,
    sent2=sent2,
    embedding1=model.encode(sent1),
    embedding2=model.encode(sent2)
)

0.9050441099916186

In [2]:
from sentence_transformers import SentencesDataset, InputExample, losses
from torch.utils.data import DataLoader

In [3]:
import pandas as pd
from tqdm import tqdm

def prepare_data_for_training(sent_pairs_df, model):
    """Turn a DataFrame of sentence pairs into scored ``InputExample`` objects.

    Args:
        sent_pairs_df: DataFrame with ``sent1`` and ``sent2`` columns.
        model: Object whose ``encode(sentence)`` returns the embedding that is
            passed on to ``calculate_similarity_score``.

    Returns:
        A list with one ``InputExample`` per row. Its label is the pair's
        similarity score rounded to two decimals.
    """
    train_examples = []

    for sent_1, sent_2 in tqdm(sent_pairs_df[['sent1', 'sent2']].values):
        # Score the pair taken from the current row. Using any module-level
        # sentences here would give every example the same label.
        score = calculate_similarity_score(
            sent1=sent_1,
            sent2=sent_2,
            embedding1=model.encode(sent_1),
            embedding2=model.encode(sent_2)
        )

        train_examples.append(
            InputExample(texts=[sent_1, sent_2], label=round(float(score), 2))
        )
    return train_examples

In [4]:
def run_training(from_model_path: str, to_model_path: str, train_examples):
    """Fine-tune a saved SentenceTransformer for one epoch and save the result.

    Args:
        from_model_path: Path of the model to load.
        to_model_path: Path the fine-tuned model is saved to.
        train_examples: Training examples wrapped into a ``SentencesDataset``.

    Returns:
        ``to_model_path``, unchanged.
    """
    model = SentenceTransformer(from_model_path)

    # Build the dataset and serve it in shuffled batches of 8.
    loader = DataLoader(
        SentencesDataset(examples=train_examples, model=model),
        batch_size=8,
        shuffle=True,
    )

    # Training objective: the loader paired with a cosine-similarity loss.
    objective = (loader, losses.CosineSimilarityLoss(model=model))

    # Train, then persist the model.
    model.fit(train_objectives=[objective], epochs=1)
    model.save(to_model_path)

    return to_model_path

In [5]:
from sentence_transformers import SentenceTransformer
import nltk
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
import random
from tqdm import tqdm
import os


def get_sents(text_path: str, len_min: int, len_max: int):
    """Read a text file and return its unique, length-filtered sentences.

    The text is split with ``nltk.sent_tokenize``. A sentence is kept when its
    length is strictly between ``len_min`` and ``len_max`` characters, and its
    newlines are replaced by spaces.

    Args:
        text_path: Path of the text file to read.
        len_min: Exclusive lower bound on sentence length.
        len_max: Exclusive upper bound on sentence length.

    Returns:
        A sorted list of the distinct sentences that passed the filter.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that this matches how the corpus file was written.
    with open(text_path, 'r') as source:
        raw_text = source.read()

    unique_sents = set()
    for candidate in nltk.sent_tokenize(raw_text):
        if len_min < len(candidate) < len_max:
            unique_sents.add(candidate.replace('\n', ' '))

    return sorted(unique_sents)


def get_clusters(vectors, n_clusters=20):
    """Cluster ``vectors`` with agglomerative clustering.

    Args:
        vectors: Samples passed to ``AgglomerativeClustering.fit_predict``.
        n_clusters: Number of clusters to form.

    Returns:
        The cluster label assigned to each sample.
    """
    return AgglomerativeClustering(n_clusters=n_clusters).fit_predict(vectors)


def get_sents_by_clusters(words, labels):
    """Group items by their cluster label.

    Args:
        words: Sequence of items; ``words[i]`` belongs to ``labels[i]``.
        labels: Iterable of cluster labels, one per item.

    Returns:
        A ``defaultdict(list)`` mapping each label to its items, in the order
        in which they appear in ``words``.
    """
    grouped = defaultdict(list)
    for position, cluster_id in enumerate(labels):
        member = words[position]
        grouped[cluster_id].append(member)
    return grouped


def clusterize_sents(sents, model, version, butch_size=10000, cluster_num=1000):
    """Cluster sentences batch by batch and write each cluster to a text file.

    For every seed in ``range(111, 115)`` the sentences are shuffled, cut into
    batches of ``butch_size``, embedded with ``model.encode`` and clustered
    into ``cluster_num`` groups. Each cluster is written, one sentence per
    line, to
    ``../data/processed/sent_clusters_{version}/seed_{seed}/...``.

    Args:
        sents: List of sentences. It is not modified.
        model: Object whose ``encode(sentence)`` returns the sentence vector.
        version: Value inserted into the export directory name.
        butch_size: Number of sentences per clustering batch.
        cluster_num: Number of clusters per batch. A batch with fewer
            sentences than this is not clustered.

    Returns:
        None. The results are written to disk.
    """
    cluster_factor = butch_size / cluster_num

    for seed in range(111, 115):
        export_path = f'../data/processed/sent_clusters_{version}/seed_{seed}/{cluster_factor}_{butch_size}_{cluster_num}'
        os.makedirs(export_path, exist_ok=True)

        # Shuffle a copy with an RNG seeded by `seed`, so the contents of a
        # `seed_{seed}` directory are reproducible and the caller's list is
        # left untouched.
        shuffled_sents = list(sents)
        random.Random(seed).shuffle(shuffled_sents)

        for offset in tqdm(range(0, len(shuffled_sents), butch_size)):
            butch_sents = shuffled_sents[offset:offset + butch_size]
            # Only the trailing batch can be short; it cannot be split into
            # `cluster_num` clusters, so stop here.
            if len(butch_sents) < cluster_num:
                break

            sent_vectors = [
                model.encode(sent)
                for sent in butch_sents
            ]
            labels = get_clusters(sent_vectors, n_clusters=cluster_num)
            sents_by_clusters = get_sents_by_clusters(butch_sents, labels)

            for cluster_label, cluster_sents in sents_by_clusters.items():
                with open(f'{export_path}/cluster_{offset}_{offset + butch_size}_{cluster_label}.txt', 'w') as f:
                    f.write('\n'.join(cluster_sents))


In [6]:
from sklearn.cluster import KMeans


def split_sent_by_cluster(sentences, model):
    """Split a list of sentences into up to four KMeans groups.

    Args:
        sentences: Sentences to split.
        model: Object whose ``encode(sentences)`` returns one embedding per
            sentence.

    Returns:
        A mapping from cluster id to the sentences assigned to it. With fewer
        than 8 sentences no clustering is done and the input is returned as a
        single group under key ``0``.
    """
    # Too few sentences to split: keep them together.
    if len(sentences) < 8:
        return {0: sentences}

    kmeans = KMeans(n_clusters=4, n_init=20, max_iter=1000)
    kmeans.fit(model.encode(sentences))

    grouped = defaultdict(list)
    for cluster_id, sentence in zip(kmeans.labels_, sentences):
        grouped[cluster_id].append(sentence)

    return grouped


def select_sent_from_clusters(cluster_version, model):
    """Build positive and negative sentence pairs from exported cluster files.

    Walks ``../data/processed/sent_clusters_{cluster_version}/<seed>/<dir>/``
    and reads every ``.txt`` cluster file (one sentence per line).

    * ``'pos'`` pairs: every ordered pair of different sentences that fall in
      the same sub-group returned by ``split_sent_by_cluster``.
    * ``'neg'`` pairs: once at least 1000 pairs have been collected, each
      sentence of a file is paired ``len(sents)`` times with the second
      sentence of a randomly chosen existing pair.

    Duplicate ``(sent1, sent2)`` rows are dropped, ``final_score`` is set to 1
    for ``'pos'`` and 0 otherwise, at most 5 rows are sampled per ``sent1``,
    and the result is written to
    ``../data/processed/sent_pairs_{cluster_version}.csv``.

    Args:
        cluster_version: Value used in the input directory and output file names.
        model: Passed through to ``split_sent_by_cluster``.

    Returns:
        None. The result is written to disk.
    """
    clusters_root = f'../data/processed/sent_clusters_{cluster_version}/'

    sent_pairs = []
    for seed_dir in os.listdir(clusters_root):
        seed_path = os.path.join(clusters_root, seed_dir)
        if not os.path.isdir(seed_path):
            continue

        for cluster_dir in os.listdir(seed_path):
            # Keep the per-directory path in its own variable. Overwriting the
            # root path here would make every later seed or cluster directory
            # resolve to a non-existent nested path and be skipped silently.
            cluster_dir_path = os.path.join(seed_path, cluster_dir)
            if not os.path.isdir(cluster_dir_path):
                continue

            anti_pairs = []
            for cluster_file in tqdm(os.listdir(cluster_dir_path)):
                if not cluster_file.endswith('.txt'):
                    continue

                cluster_file_path = os.path.join(cluster_dir_path, cluster_file)
                with open(cluster_file_path, 'r') as f:
                    sents = f.read().split('\n')

                if len(sents) < 2:
                    continue

                for group, sentences in split_sent_by_cluster(
                        sents, model
                ).items():
                    for s1 in sentences:
                        for s2 in sentences:
                            if s1 == s2:
                                continue
                            sent_pairs.append(('pos', s1, s2))

                # Wait until the pool of candidate sentences is large enough
                # before drawing negatives from it.
                if len(sent_pairs) < 1000:
                    continue

                for sent in sents:
                    for _ in range(len(sents)):
                        other = random.choice(sent_pairs)[2]
                        # A sentence paired with itself is not a negative example.
                        if other == sent:
                            continue
                        anti_pairs.append(('neg', sent, other))

            sent_pairs.extend(anti_pairs)

    sent_pairs_df = pd.DataFrame(sent_pairs, columns=['type', 'sent1', 'sent2'])
    sent_pairs_df.drop_duplicates(subset=['sent1', 'sent2'], inplace=True)
    sent_pairs_df['final_score'] = sent_pairs_df['type'].apply(lambda x: 1 if x == 'pos' else 0)

    # Cap the number of pairs per first sentence at 5 (fixed random_state).
    slice_df = pd.DataFrame(
        sent_pairs_df.groupby('sent1').apply(lambda x: x.sample(n=min(len(x), 5), random_state=1)).reset_index(drop=True)
    )

    slice_df.to_csv(f'../data/processed/sent_pairs_{cluster_version}.csv', index=False)

In [None]:
# Iterative fine-tuning: each pass loads checkpoint N, builds sentence pairs
# with it, and saves the fine-tuned model as checkpoint N + 1, which the next
# pass loads.
for iteration in range(80, 85):
    from_model_path = f'./sbert_from_mlm_bert_{iteration}'
    to_model_path = f'./sbert_from_mlm_bert_{iteration + 1}'

    model_from = SentenceTransformer(from_model_path)

    # Sentences whose length is strictly between 30 and 40 characters.
    sentence = get_sents('../data/processed/oshhamaho.txt', 30, 40)
    # Write the cluster files for this iteration, then derive the pair CSV from them.
    clusterize_sents(sents=sentence, model=model_from, version=iteration, butch_size=5000, cluster_num=500)
    select_sent_from_clusters(cluster_version=iteration, model=model_from)

    # Reload the pairs written above, score them, and train from the same checkpoint.
    sent_pairs_df = pd.read_csv(f'../data/processed/sent_pairs_{iteration}.csv')
    train_examples = prepare_data_for_training(sent_pairs_df, model_from)
    run_training(from_model_path, to_model_path, train_examples)

  2%|▏         | 2553/139843 [00:34<30:35, 74.81it/s]