In [1]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model from a local directory (path is relative to
# the notebook's working directory). `model` is used by later cells to encode
# sentences into vectors.
# NOTE(review): judging by the directory name this is a fine-tuned SBERT
# checkpoint — confirm its provenance and which languages it was trained on.
model = SentenceTransformer('./sbert_from_mlm_bert_mix_2')

In [2]:
import nltk
import pandas as pd

# Read the two parallel texts (Russian original and Kabardian translation).
# The encoding is passed explicitly: both files contain Cyrillic text, and
# without it open() falls back to the platform default codec, which is not
# UTF-8 on every system.
# NOTE(review): assumes the files are stored as UTF-8 — confirm.
with open('../data/interim/oshhamaho/кавказский пленник.rus.txt', 'r', encoding='utf-8') as f:
    text_rus = f.read()

with open('../data/interim/oshhamaho/кавказский пленник.kbd.txt', 'r', encoding='utf-8') as f:
    text_kbd = f.read()

# One row per sentence, in document order; the row position is later used as
# the sentence's position in the text.
# NOTE(review): sent_tokenize() is called with its default language model;
# NLTK also accepts language='russian' — check whether that gives better
# sentence boundaries for these texts before switching.
df_rus = pd.DataFrame(nltk.sent_tokenize(text_rus), columns=['sent'])
df_kbd = pd.DataFrame(nltk.sent_tokenize(text_kbd), columns=['sent'])

# Encode all sentences of a text in a single batched call instead of one
# model.encode() call per sentence. encode() returns one embedding row per
# input sentence; list(...) splits the result into per-row 1-D vectors so that
# each DataFrame cell holds the vector of its sentence.
df_rus['vector'] = list(model.encode(df_rus['sent'].tolist()))
df_kbd['vector'] = list(model.encode(df_kbd['sent'].tolist()))

In [3]:
import numpy as np

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero norm the
        cosine is undefined; ``0.0`` is returned in that case instead of NaN,
        so callers can sort and format the scores safely.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(v1, v2) / denominator

# Number of Kabardian sentences considered on each side of the position of the
# current Russian sentence.
window = 10
output_file = '../data/interim/oshhamaho/кавказский пленник.rus_kbd.draft3.txt'

# Compare the texts: for every Russian sentence, write it out followed by the
# nearby Kabardian sentences ranked by cosine similarity (best match first).
# The file is written as UTF-8 explicitly so the Cyrillic output does not
# depend on the platform's default codec.
with open(output_file, 'w', encoding='utf-8') as f:
    for i, (rus_sent, rus_vector) in enumerate(zip(df_rus['sent'], df_rus['vector'])):
        f.write(f'{rus_sent}\n')

        # Candidate range is positions i - window .. i + window inclusive,
        # clipped to the bounds of the Kabardian text. The "+ 1" is needed
        # because the slice end is exclusive.
        # Note: candidates are centred on the same index i, so if the Russian
        # text is much longer than the Kabardian one the range becomes empty
        # for trailing sentences and no candidates are written.
        start = max(0, i - window)
        end = min(len(df_kbd), i + window + 1)

        candidates = df_kbd.iloc[start:end]
        similarities = [
            (kbd_sent, cosine_similarity(rus_vector, kbd_vector))
            for kbd_sent, kbd_vector in zip(candidates['sent'], candidates['vector'])
        ]

        # Highest similarity first; each candidate on its own tab-indented line.
        for kbd, score in sorted(similarities, key=lambda x: x[1], reverse=True):
            f.write(f'\t{score:.2f} {kbd}\n')

        # Blank line separates the groups of consecutive Russian sentences.
        f.write('\n')