In [None]:
import pymorphy2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import razdel
import nltk
import string
from string import punctuation
from nltk.corpus import stopwords
import wikipedia
from razdel import tokenize
from sentence_transformers import SentenceTransformer, util

In [None]:

def get_token_sen(page):
    """Split the plain text of a Wikipedia article into sentences.

    Everything from the earliest trailing reference heading onward is
    discarded, the remaining section headings are replaced by a single
    space, and the result is segmented with ``razdel.sentenize``.

    Args:
        page: article text in which section headings look like
            ``\\n\\n\\n== Title ==`` or ``\\n\\n\\n=== Title ===``.

    Returns:
        list[str]: sentence texts in document order.
    """
    # Headings of the trailing sections ("Notes", "Editions") that hold
    # references rather than article prose.
    tail_heading_patterns = (
        r'\n{3}===? Примечания ===?',
        r'\n{3}===? Издания ===?',
    )
    # re.search returns a match object or None, so test against None
    # (an `is re.Match` check is never true and would skip the truncation).
    found = (re.search(pattern, page) for pattern in tail_heading_patterns)
    cut_points = [match.start() for match in found if match is not None]
    if cut_points:
        # Cut at whichever heading comes first, independent of their order.
        page = page[:min(cut_points)]

    # TODO (author's note): extend the character class with symbols such as - / ?
    clean = re.sub('\n{3}===? [а-яА-ЯЁё0-9 ]+ ===?', ' ', page)
    return [sentence.text for sentence in razdel.sentenize(clean)]

In [None]:
import unicodedata

def remove_accents(input_str):
    """Strip stress marks and other diacritics from ``input_str``.

    The text is decomposed with NFKD and combining marks are dropped, with
    one exception: the Russian letters й/Й and ё/Ё also decompose into a
    base letter plus a combining mark, and removing that mark would turn
    them into the different letters и/И and е/Е. Those pairs are recomposed
    instead of stripped.

    Args:
        input_str: text that may contain combining accents, e.g. the
            stress marks used in Wikipedia article leads.

    Returns:
        str: the text without combining marks, with й and ё preserved.
    """
    # (base letter, combining mark) -> distinct Cyrillic letter.
    letter_forming = {
        ('\u0438', '\u0306'): '\u0439',  # и + breve     -> й
        ('\u0418', '\u0306'): '\u0419',  # И + breve     -> Й
        ('\u0435', '\u0308'): '\u0451',  # е + diaeresis -> ё
        ('\u0415', '\u0308'): '\u0401',  # Е + diaeresis -> Ё
    }
    kept = []
    for ch in unicodedata.normalize('NFKD', input_str):
        if not unicodedata.combining(ch):
            kept.append(ch)
        elif kept and (kept[-1], ch) in letter_forming:
            # The mark is part of the letter itself, not an accent.
            kept[-1] = letter_forming[(kept[-1], ch)]
        # Any other combining mark (stress accent etc.) is dropped.
    return u"".join(kept)

remove_accents('Алекса́ндр Серге́евич Пу́шкин')

'Александр Сергеевич Пушкин'

In [None]:
def token_word_morphy(sen, morph = pymorphy2.MorphAnalyzer()):
  """Tokenize ``sen`` and return the normal form of every token.

  Tokens come from ``razdel.tokenize``; for each one the first parse
  returned by ``morph.parse`` is used and its ``normal_form`` is collected.

  Args:
      sen: text to tokenize.
      morph: analyzer object exposing ``parse``. The default instance is
          created once, when this ``def`` statement is executed, and is
          reused by every call that does not pass its own analyzer.

  Returns:
      list[str]: normal forms in token order.
  """
  lemmas = []
  for token in razdel.tokenize(sen):
    first_parse = morph.parse(token.text)[0]
    lemmas.append(first_parse.normal_form)
  return lemmas

In [None]:
def noise():
    """Build the list of tokens to treat as noise.

    Returns:
        list[str]: NLTK's Russian stop words followed by every single
        character of ``string.punctuation`` and of the extra symbol string
        below (each character is its own list entry; repeated characters
        are kept as repeated entries).
    """
    # NOTE(review): the Latin capitals here are upper-case; if the consuming
    # vectorizer lower-cases text before filtering they may never match — confirm.
    extra_symbols = punctuation + 'IIVVIX«»\n№́'
    noise_tokens = list(stopwords.words('russian'))
    noise_tokens.extend(extra_symbols)
    return noise_tokens

In [None]:
def cosine_sim_2_sen(sen1, sen2, vectorizer = CountVectorizer(tokenizer = token_word_morphy, stop_words = noise())):
  """Cosine similarity between the vectorized forms of two sentences.

  The vectorizer is fitted on exactly these two sentences, so the
  vocabulary is rebuilt on every call.

  Args:
      sen1: first sentence.
      sen2: second sentence.
      vectorizer: object with ``fit_transform`` whose result supports
          ``toarray()``. The default ``CountVectorizer`` is created once,
          when this ``def`` statement is executed.

  Returns:
      The value returned by ``cosine_similarity`` for the two single-row
      matrices (a 1x1 array).
  """
  term_matrix = vectorizer.fit_transform([sen1, sen2]).toarray()
  # Slicing keeps each row two-dimensional, as cosine_similarity expects.
  first_row = term_matrix[:1]
  second_row = term_matrix[1:2]
  return cosine_similarity(first_row, second_row)

In [None]:
def cosine_sim_score(claim, page, min_value = 0, top = None,\
                     vectorizer = TfidfVectorizer(tokenizer = token_word_morphy, stop_words=noise())):
  """Rank the sentences of ``page`` by cosine similarity to ``claim``.

  The page is split with ``get_token_sen``, each sentence is passed through
  ``remove_accents``, and the claim plus all sentences are vectorized
  together. Every sentence is then compared with the claim.

  The author's original note also mentions another TF-IDF variant that is
  fed the claim and the whole document; it is not implemented here.

  Args:
      claim: the statement to compare against (str).
      page: article text (str).
      min_value: similarity threshold; only sentences scoring strictly
          above it are kept.
      top: if given, return only the ``top`` best-scoring entries.
      vectorizer: object with ``fit_transform`` whose result supports
          ``toarray()``. The default ``TfidfVectorizer`` is created once,
          when this ``def`` statement is executed.

  Returns:
      list of ``[index, sentence, score]`` sorted by score, highest first.
      ``index`` is the sentence's 1-based position (the claim occupies
      position 0) and ``score`` is the 1x1 array returned by
      ``cosine_similarity``.
  """
  documents = [claim] + [remove_accents(sentence) for sentence in get_token_sen(page)]
  weights = vectorizer.fit_transform(documents).toarray()
  claim_row = weights[:1]

  ranked = []
  for idx in range(1, len(weights)):
    similarity = cosine_similarity(claim_row, weights[idx:idx + 1])
    if similarity > min_value:
      ranked.append([idx, documents[idx], similarity])

  ranked.sort(key = lambda entry: entry[2], reverse = True)
  return ranked if top is None else ranked[:top]

In [None]:
# Another checkpoint name noted by the author: 'DeepPavlov/rubert-base-cased'
def rubert_cosine_sim_score(claim, page, min_value = 0.24, top = None,\
                            model = SentenceTransformer('distiluse-base-multilingual-cased-v1')):
  """Rank the sentences of ``page`` by embedding similarity to ``claim``.

  The page is split with ``get_token_sen``, each sentence is passed through
  ``remove_accents``, and the claim plus all sentences are encoded in one
  ``model.encode`` call. Every sentence embedding is then compared with the
  claim embedding via ``util.pytorch_cos_sim``.

  Args:
      claim: the statement to compare against.
      page: article text.
      min_value: similarity threshold; only sentences scoring strictly
          above it are kept.
      top: if given, return only the ``top`` best-scoring entries.
      model: object exposing ``encode(texts, convert_to_tensor=True)``. The
          default ``SentenceTransformer`` is constructed once, when this
          ``def`` statement is executed.

  Returns:
      list of ``[index, sentence, score]`` sorted by score, highest first.
      ``index`` is the sentence's 1-based position (the claim occupies
      position 0) and ``score`` is the value returned by
      ``util.pytorch_cos_sim``.
  """
  documents = [claim] + [remove_accents(sentence) for sentence in get_token_sen(page)]
  embeddings = model.encode(documents, convert_to_tensor=True)
  claim_embedding = embeddings[0]

  ranked = []
  for idx in range(1, len(embeddings)):
    similarity = util.pytorch_cos_sim(claim_embedding, embeddings[idx])
    if similarity > min_value:
      ranked.append([idx, documents[idx], similarity])

  ranked.sort(key = lambda entry: entry[2], reverse = True)
  return ranked if top is None else ranked[:top]

HBox(children=(FloatProgress(value=0.0, max=504594086.0), HTML(value='')))




In [None]:
def top_l_sentences(claim, corpus_articles, top_sen = 15):
  """Score every listed Wikipedia article against ``claim``.

  The ``wikipedia`` client is switched to Russian, each title is fetched
  with ``wikipedia.page`` and the page's ``content`` is ranked with
  ``rubert_cosine_sim_score``.

  Args:
      claim: the statement to compare against.
      corpus_articles: iterable of article titles.
      top_sen: value passed as ``top`` to ``rubert_cosine_sim_score``.

  Returns:
      list: one ranking per title, in the order of ``corpus_articles``.
  """
  wikipedia.set_lang("ru")
  return [
      rubert_cosine_sim_score(claim, wikipedia.page(title).content, top = top_sen)
      for title in corpus_articles
  ]

In [None]:
import plotly
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt 
# Scatter plot of cosine-similarity scores for five selected sentences.
# NOTE(review): `score_bert` (y values and marker colours) and `arr_bert`
# (point labels) are not defined in the visible cells — presumably they come
# from an earlier ranking run; confirm they exist before running this cell.
x = ['номер 23','номер 1', 'номер 3', 'номер 40', 'номер 27']

fig = go.Figure()
# One trace carries both the markers and their text labels. A second,
# label-less copy of the same points would only draw every marker and the
# colour bar twice.
fig.add_trace(go.Scatter(
    x=x,
    y=score_bert,
    mode='markers+text',
    text=arr_bert,
    # One position per point, chosen by hand so the labels do not overlap.
    textposition=["bottom right", "bottom right","top center","bottom left","bottom left"],
    textfont={'size': 9.5},
    marker=dict(color=score_bert, colorbar=dict(), colorscale='plasma', size=9),
))

fig.update_layout(
    legend=dict(x=.5, xanchor="center"),
    xaxis_title="Номера предложений из статьи",
    yaxis_title="Значение косинусного сходства",
    width=1200,  # 1100 tf (author's note)
    height=500,
    margin=dict(l=0, r=0, t=30, b=0),
    showlegend=False,
)
fig.show()