In [0]:
!pip install Cython numpy
!pip install git+https://github.com/lopuhin/python-adagram.git

In [0]:
!pip install pymorphy2

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
import adagram
from lxml import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from pymorphy2 import MorphAnalyzer
from string import punctuation
import json, os
from collections import Counter
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))



def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split() if word and word not in stops]
    words = [word for word in words if word]

    return words

def normalize(text):
    
    words = tokenize(text)
    words = [morph.parse(word)[0].normal_form for word in words if word]

    return words

In [26]:
from google.colab import files

uploaded = files.upload()

Saving paraphrases.xml to paraphrases.xml


## Задание № 1. Протестировать адаграм в определении перефразирования

In [0]:
import pandas as pd
corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [0]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [0]:
f_t_data = []
for t in data['text_1_norm']:
  f_t_data += t
for t in data['text_2_norm']:
  f_t_data += t
f = open('first_task_corpus.txt', 'w')
f.write(' '.join(f_t_data))
f.close()

In [38]:
!adagram-train first_task_corpus.txt f_t_out.pkl

[INFO] 2020-03-05 17:40:45,063 Building dictionary...
[INFO] 2020-03-05 17:40:45,286 Done! 1839 words.
[INFO] 2020-03-05 17:40:51,478 92.70% -6.7697 0.0018 1.7/2.0 10.44 kwords/sec
[INFO] 2020-03-05 17:40:51,965 100.00% -6.7668 0.0000 1.7/2.0 10.35 kwords/sec


In [0]:
f_t_vm = adagram.VectorModel.load("f_t_out.pkl")

Векторизуйте пары текстов с помощью Адаграма, обучите любую модель и оцените качество (кросс-валидацией). 

За основу возьмите код из предыдущего семинара/домашки, только в функции get_embedding вам нужно выбирать вектор нужного значения (импользуйте model.disambiguate и model.sense_vector). Отдельные векторы усредните как и в предыдущем семинаре.

Для вытаскивания пар (целевое слово, контекстые слова) вам нужно будет написать специальную функцию.

In [0]:
# проверяте на списке из чисел, чтобы было удобно дебажить
words = [0,1,2,3,4,5,6,7,8,9]

def get_words_in_context(words, window=3):
    words_in_context = []
    for i in range(len(words)):
      l = len(words)
      s_1, e_1, s_2, e_2 = 0,0,l,l
      if i > window:
        s_1 = i - window
      if i > 0:
        e_1 = i
      if i < l - 1:
        s_2 = i + 1
      if l - i >= window:
        e_2 = s_2 + window
      word = words[i]
      context = words[s_1:e_1] + words[s_2:e_2]
      words_in_context.append([word, context])

    return words_in_context

In [78]:
# работать должно вот так
get_words_in_context(words)

[[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

In [0]:
def get_embedding_adagram(text, model, window, dim):
    
    word2context = get_words_in_context(text, window)
    
    
    vectors = np.zeros((len(word2context), dim))
    
    for i, (word, context) in enumerate(word2context):
        
        try:
            a_m = model.disambiguate(word, context).argmax()
            v = vm.sense_vector(word, a_m)
            print(v)
            vectors[i] = v
        
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector
        

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

n_fold = 10
stratified_folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

model = DecisionTreeClassifier()
grid = {
    'max_features': [50,100,150],
    'random_state': [42]
}

In [0]:
dim = 100
X_text_1_ada = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ada = np.zeros((len(data['text_2_norm']), dim))

for i in range(len(data['text_1_norm'])):
    X_text_1_ada[i] = get_embedding_adagram(data['text_1_norm'][i], f_t_vm, 3, dim)
    
for i in range(len(data['text_2_norm'])):
    X_text_2_ada[i] = get_embedding_adagram(data['text_2_norm'][i], f_t_vm, 3, dim)

In [0]:
X_text_ada = np.concatenate([X_text_1_ada, X_text_2_ada], axis=1)
y = classes

In [0]:
from sklearn.model_selection import GridSearchCV
def grid_search_best(model, grid, folds, grid_train_data, grid_train_classes):
  grid_search = GridSearchCV(model, param_grid=grid, cv=folds, scoring='f1_macro')
  grid_search.fit(grid_train_data, grid_train_classes) 
  return grid_search.best_score_, grid_search.best_params_, grid_search.best_estimator_

In [0]:
score, params, estimator = grid_search_best(model, grid, stratified_folds, X_text_ada, y)

In [97]:
print(score)

0.4025943215461577


In [0]:
# запустите если не установлен ворднет
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Задание 2. Реализовать алгоритм Леска и проверить его на реальном датасете

In [0]:
from nltk.corpus import wordnet as wn
def lesk( word, sentence ):
    bestsense = 0
    maxoverlap = 0

    synsets = wn.synsets(word)
    
    for i, syns in enumerate(synsets):
      overlap = 0
      for s in sentence:
        if s in syns.definition():
          overlap += 1
      if overlap > maxoverlap:
        bestsense = i
        maxoverlap = overlap
        
        
    return bestsense


Работать функция должна как-то так:

In [102]:
# на вход подается элемент результата работы уже написанной вами функции get_words_in_context
lesk('day', 'some point or period in time'.split()) # для примера контекст совпадает с одним из определений
# а на выходе индекс подходящего синсета

1

In [103]:
# с помощью этого индекса достаем нужный синсет
wn.synsets('day')[1].definition()

'some point or period in time'

In [0]:
corpus_wsd = []
corpus = open('corpus_wsd_50k.txt').read().split('\n\n')
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])

**Вам нужно для каждого многозначного слова (т.е. у него есть тэг в первом поле) с помощью алгоритма Леска предсказать нужный синсет и сравнить с правильным. Посчитайте процент правильных предсказаний (accuracy).**

Если считается слишком долго, возьмите поменьше предложений (например, только тысячу)

In [0]:
def get_context(words, i, window=5):
  l = len(words)
  s_1, e_1, s_2, e_2 = 0,0,l,l
  if i > window:
    s_1 = i - window
  if i > 0:
    e_1 = i
  if i < l - 1:
    s_2 = i + 1
  if l - i >= window:
    e_2 = s_2 + window
  context = words[s_1:e_1] + words[s_2:e_2]
  return context

In [0]:
windows = [3,5,7,9]
best_acc = 0
best_window = 0
for window in windows:
  true = 0
  total = 0
  for sent in corpus_wsd[:1000]:
    words = []
    for w in sent:
      words.append(w[2])
    for i in range(len(sent)):
      w = sent[i]
      if w[0] == "":
        continue
      total += 1
      ctx = get_context(words, i, window)
      sense = lesk(w[1], ctx)
      if wn.synsets(w[1])[sense] == wn.lemma_from_key(w[0]).synset():
        true += 1
    acc = true/total
    if acc > best_acc:
      best_window = window
      best_acc = acc

In [142]:
print(best_acc, best_window)

0.5081967213114754 3


### Дополнительный балл

Если хотите заработать дополнительный балл, попробуйте улучшить алгоритм Леска любым способом (например, использовать расстояние редактирования вместо пересечения или даже вставить машинное обучение)

In [0]:
def jaccard_metric (a,b,c):
  if a+b-c == 0:
    return 0.0001
  return c/(a+b-c)

def serensen_metric(a,b,c):
  if a+b == 0:
    return 0.0001
  return 2*c/(a+b)

def simpson_metric(a,b,c):
  if min(a,b) == 0:
    return 0.0001
  return c/min(a,b)

def braun_metric(a,b,c):
  if max(a,b) == 0:
    return 0.0001
  return c/max(a,b)

def compare_words(w1, w2, w_threshold, metric):
  a = len(w1)
  b = len(w2)
  c = 0
  
  for i in range(len(w1)-1):
    first = w1[i:i+1]
    for j in range(len(w2)-1):
      if w2[j:j+1] == first:
        c += 1
        break

  return  metric(a,b,c) > w_threshold

def compare_sents(s1, s2, w_threshold, metric):
  if type(s1) != list:
    s1 = s1.split()
  if type(s2) != list:
    s2 = s2.split()

  s1 = [w for w in s1 if len(w) > 2]
  s2 = [w for w in s2 if len(w) > 2]

  c = 0
  a = len(s1)
  b = len(s2)
  for first in s1:
    for second in s2:
      if compare_words(first, second, w_threshold, metric):
        c += 1
        break
  return metric(a,b,c)

def new_lesk( word, sentence, w_threshold, metric):
    bestsense = 0
    best_metric = 0

    synsets = wn.synsets(word)
    
    for i, syns in enumerate(synsets):
      overlap = 0
      m_score = compare_sents(sentence, syns.definition(), w_threshold, metric)
      if m_score > best_metric:
        bestsense = i
        maxoverlap = overlap
        
        
    return bestsense

In [0]:
prev_best_score = best_acc
prev_best_window = best_window
best_metric = None
best_threshold = 0
best_score = 0
best_w = 0

metrics = [jaccard_metric, serensen_metric, simpson_metric, braun_metric]
w_thresholds = [0, 0.2, 0.4, 0.6, 0.8, 1]
for metric in metrics:
  for w_t in w_thresholds:
    windows = [3,5,7,9]
    best_acc = 0
    best_window = 0
    for window in windows:
      true = 0
      total = 0
      for sent in corpus_wsd[:1000]:
        words = []
        for w in sent:
          words.append(w[2])
        for i in range(len(sent)):
          w = sent[i]
          if w[0] == "":
            continue
          total += 1
          ctx = get_context(words, i, window)
          sense = new_lesk(w[1], ctx, w_t, metric)
          if wn.synsets(w[1])[sense] == wn.lemma_from_key(w[0]).synset():
            true += 1
        acc = true/total
        if acc > best_acc:
          best_window = window
          best_acc = acc
    if best_acc > prev_best_score:
      best_metric = metric
      best_threshold = w_t
      best_score = best_acc
      best_w = best_window

In [153]:
print(best_metric, best_threshold, best_score, best_w)

<function braun_metric at 0x7fa0fa607d90> 1 0.6239316239316239 3


In [156]:
prev_best_score = best_acc
print(prev_best_score, best_score)

0.5081967213114754 0.6239316239316239
