<a href="https://colab.research.google.com/github/luchia0602/OOP/blob/main/%D0%9F%D0%A0%D0%9E%D0%95%D0%9A%D0%A2_%D1%8F%D0%BF%D0%BE%D0%BD%D1%81%D0%BA%D0%B8%D0%B9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install razdel
!pip3 install sentence-transformers
!pip install py-hasami

In [None]:
import re

import hasami
import numpy as np
import pandas as pd
import razdel
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import spatial
from sentence_transformers import SentenceTransformer

In [None]:
# Read the English text; the file is only needed for the read itself.
with open('HP_eng_9.txt', 'r', encoding='utf-8') as file:
    en_text = file.read()

# Replace every newline with a space, then split the text into sentences.
en = en_text.replace('\n', ' ')
sent_en = [sentence.text for sentence in razdel.sentenize(en)]

print("English text:", len(sent_en), "sentences")
print(sent_en[0])

English text: 388 sentences
Harry had never believed he would meet a boy he hated more than Dudley, but that was before he met Draco Malfoy.


In [None]:
# Read the Japanese text; the file is only needed for the read itself.
with open('HP JAP.txt', 'r', encoding='utf-8') as file:
    ja_text = file.read()

# Replace every newline with a space, then split the text into sentences.
ja = ja_text.replace('\n', ' ')
sent_ja = hasami.segment_sentences(ja)  # list of sentences

print("Japanese text:", len(sent_ja), "sentences")
print(sent_ja[0])

Japanese text: 317 sentences
ダドリーより嫌なヤツがこの世の中にいるなんて、ハリーは思ってもみなかった。


In [None]:
def get_batch(iter1, iter2, batch_size):
    """Yield proportionally aligned slices of two sequences, batch by batch.

    ``iter1`` is cut into consecutive chunks of ``batch_size`` items. Each
    chunk is paired with the slice of ``iter2`` that covers the same relative
    position, so both sequences are walked through at the same relative pace.

    The ``iter2`` boundaries are derived from the ``iter1`` boundaries with
    integer arithmetic instead of a rounded fixed step. A rounded step
    accumulates rounding error from batch to batch, which could leave the tail
    of ``iter2`` out of every batch; with proportional boundaries the slices
    are contiguous and the last one always ends at ``len(iter2)``.

    Args:
        iter1: Sequence supporting ``len`` and slicing; it drives the batching.
        iter2: Sequence supporting ``len`` and slicing.
        batch_size: Number of ``iter1`` items per batch.

    Yields:
        ``(iter1_slice, iter2_slice)`` tuples. Nothing is yielded when
        ``iter1`` is empty.
    """
    l1 = len(iter1)
    l2 = len(iter2)
    if l1 == 0:
        # Nothing to batch; also avoids dividing by zero below.
        return
    for start1 in range(0, l1, batch_size):
        end1 = min(start1 + batch_size, l1)
        # Map the iter1 boundaries onto iter2; end1 == l1 maps exactly to l2.
        start2 = start1 * l2 // l1
        end2 = end1 * l2 // l1
        yield iter1[start1:end1], iter2[start2:end2]

In [None]:
# Load the sentence-embedding model by name. Its `encode` method is used
# further down to turn both the English and the Japanese sentences into vectors.
model_st = SentenceTransformer('distiluse-base-multilingual-cased')

In [None]:
# Counters and tunable parameters used by the alignment cells below.
batch_number = 0  # incremented once for every batch that gets encoded
total_pairs = 0  # not updated anywhere in the visible part of the notebook
batch_size = 388  # English sentences per batch; equals the English sentence count printed above, so the whole text is one batch
window = 8  # passed to get_sim_matrix: half-width of the band around the rescaled diagonal
threshold = 0.3  # minimum similarity kept by get_pairs; also the lower colour bound (vmin) of the heatmaps

In [None]:
def get_sim_matrix(vec1, vec2, window=10):
    """Build a banded cosine-similarity matrix between two vector lists.

    Cell ``[i, j]`` holds ``1 - cosine_distance(vec1[i], vec2[j])``, but only
    for pairs that lie near the diagonal once the column index is rescaled to
    the row range (``j * len(vec1) / len(vec2)`` strictly within ``window`` of
    ``i``). All other cells stay ``0``.

    Args:
        vec1: Sequence of vectors; one matrix row per vector.
        vec2: Sequence of vectors; one matrix column per vector.
        window: Half-width of the band around the rescaled diagonal.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vec1), len(vec2))``. If either input
        is empty, the (empty) zero matrix is returned instead of failing on
        the division by ``len(vec2)``.
    """
    n_rows, n_cols = len(vec1), len(vec2)
    sim_matrix = np.zeros((n_rows, n_cols))
    if n_rows == 0 or n_cols == 0:
        return sim_matrix

    # Ratio that maps a column index onto the row axis.
    k = n_rows / n_cols
    for j in range(n_cols):
        center = j * k
        for i in range(n_rows):
            # Only score pairs inside the band around the diagonal.
            if i - window < center < i + window:
                sim_matrix[i, j] = 1 - spatial.distance.cosine(vec1[i], vec2[j])
    return sim_matrix

In [None]:
# Encode the sentences batch by batch, collecting the English vectors in
# vectors1 and the Japanese vectors in vectors2.
vectors1, vectors2 = [], []

for lines_en_batch, lines_ja_batch in get_batch(sent_en, sent_ja, batch_size):
    batch_number += 1
    vectors1.extend(model_st.encode(lines_en_batch))
    vectors2.extend(model_st.encode(lines_ja_batch))

    # Stop after the first batch.
    break

In [None]:
# Similarity matrix with one row per English vector and one column per
# Japanese vector; only cells near the rescaled diagonal are filled in.
sim_matrix = get_sim_matrix(vectors1, vectors2, window)

In [None]:
# Rows of sim_matrix come from the English vectors and columns from the
# Japanese vectors. The heatmap draws rows along the y axis and columns along
# the x axis, so x is Japanese and y is English.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(sim_matrix, cmap="Greens", vmin=threshold, ax=ax)
ax.set_xlabel("japanese", fontsize=18)
ax.set_ylabel("english", fontsize=18)
plt.show()

In [None]:
def get_pairs(en_lines, ja_lines, sim_matrix, threshold):
    """Collect the line pairs whose similarity reaches ``threshold``.

    The matrix is scanned row by row. Note the indexing: ``en_lines`` is
    looked up by the *column* index and ``ja_lines`` by the *row* index, so
    callers must pass the column-side lines first regardless of the
    parameter names.

    Args:
        en_lines: Lines indexed by matrix column.
        ja_lines: Lines indexed by matrix row.
        sim_matrix: 2-D array of similarity scores.
        threshold: Minimum score (inclusive) for a cell to be kept.

    Returns:
        Three parallel lists: the selected ``en_lines`` items, the selected
        ``ja_lines`` items, and the matching scores.
    """
    n_rows = sim_matrix.shape[0]
    n_cols = sim_matrix.shape[1]

    # Row-major list of every cell that passes the threshold.
    hits = [
        (row, col)
        for row in range(n_rows)
        for col in range(n_cols)
        if sim_matrix[row, col] >= threshold
    ]

    by_column = [en_lines[col] for _, col in hits]
    by_row = [ja_lines[row] for row, _ in hits]
    scores = [sim_matrix[row, col] for row, col in hits]
    return by_column, by_row, scores

In [None]:
# Keep only the highest-scoring column of every row; all other cells are zero.
row_idx = range(len(sim_matrix))
best_col_idx = sim_matrix.argmax(1)
sim_matrix_best = np.zeros_like(sim_matrix)
sim_matrix_best[row_idx, best_col_idx] = sim_matrix[row_idx, best_col_idx]

In [None]:
# Rows of sim_matrix_best correspond to the English sentences and columns to
# the Japanese ones. The heatmap draws rows along the y axis and columns along
# the x axis, so x is Japanese and y is English.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(sim_matrix_best, cmap="Reds", vmin=threshold, ax=ax)
ax.set_xlabel("japanese", fontsize=18)
ax.set_ylabel("english", fontsize=18)
plt.show()

In [None]:
# get_pairs indexes its first argument by matrix column and its second by
# matrix row. The columns were built from the Japanese sentences and the rows
# from the English ones, hence the Japanese list is passed (and returned) first.
res_ja, res_en, sims = get_pairs(sent_ja, sent_en, sim_matrix_best, threshold)

In [None]:
# Print every aligned pair followed by its similarity score.
for en_sentence, ja_sentence, score in zip(res_en, res_ja, sims):
    print(en_sentence, ja_sentence, "", sep="\n")
    print(">> similarity", score, "\n\n")

In [None]:
# Both result lists should have the same number of entries.
print(len(res_en), len(res_ja), sep="\n")

269
269


In [None]:
# Collect the aligned sentence pairs and their similarity scores in one table.
data = pd.DataFrame({'ja': res_ja, 'en': res_en, 'sim': sims})
data

Unnamed: 0,ja,en,sim
0,ダドリーより嫌なヤツがこの世の中にいるなんて、ハリーは思ってもみなかった。,Harry had never believed he would meet a boy h...,0.661952
1,一年生ではグリフィンドールとスリザリンが一緒のクラスになるのは魔法薬学の授業だけだったので、...,"Still, first-year Gryffindors only had Potions...",0.502783
2,少なくとも、グリフィンドールの談話室に「お知らせ」が出るまではそうだった。,"Or at least, they didn’t until they spotted a ...",0.512457
3,――飛行訓練は木曜日に始まります。,Flying lessons would be starting on Thursday —...,0.599966
4,ダドリーより嫌なヤツがこの世の中にいるなんて、ハリーは思ってもみなかった。,"“Typical,” said Harry darkly.",0.396301
...,...,...,...
264,ベッドに入ってからそれを考えていた。,But Hermione had given Harry something else to...,0.417430
265,犬が何かを守っている……ハグリッドが何て言ったっけ？,The dog was guarding something. . . .,0.590029
266,犬が何かを守っている……ハグリッドが何て言ったっけ？,What had Hagrid said?,0.516516
267,「グリンゴッツは何かを隠すには世界で一番安全な場所だ――たぶんホグワーツ以外では……」 七一...,Gringotts was the safest place in the world fo...,0.669144


In [None]:
# Save the table of aligned pairs as an Excel workbook; the relative path
# puts the file in the current working directory.
data.to_excel('ENG_JAP_corpora.xlsx')