In [0]:
%load_ext autoreload

from gensim.models import Word2Vec, KeyedVectors

## Задание

Реализуйте поиск по [Quora question pairs](https://www.kaggle.com/loopdigga/quora-question-pairs-russian) на нескольких векторных моделях

    1. fasttext, модель ruscorpora_none_fasttextskipgram_300_2_2019
    2. elmo, модель ruwikiruscorpora_lemmas_elmo_1024_2019
    3. bert*, RuBERT - необязательно
   
Первые две обученные модели можно скачать на сайте [rusvectores](https://rusvectores.org/en/models/).

BERT делать необязательно, но если сделаете, 6 за курс у вас автоматом. Модель можно [найти тут](http://docs.deeppavlov.ai/en/master/features/models/bert.html).

In [11]:
!pip install pymorphy2



In [0]:
import numpy as np
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pymorphy2 as pm
import string
import re

In [0]:
def get_data(file):
    """Read the question-pairs CSV and split its columns into parallel lists.

    The first row is treated as a header and skipped. Each remaining non-empty
    row must provide at least four columns: pair id, first question, second
    question and duplicate flag.

    Args:
        file: Path of the CSV file; it is read as UTF-8.

    Returns:
        A tuple ``(data, documents_1, documents_2, is_dupl)`` of equally long
        lists: ``data`` holds one-element lists ``[id]``, ``documents_1`` and
        ``documents_2`` hold the question texts, ``is_dupl`` holds the
        duplicate flags converted to ``int``.

    Raises:
        ValueError: if column 0 or column 3 of a row is not an integer.
        IndexError: if a non-empty row has fewer than four columns.
    """
    data = []
    documents_1 = []
    documents_2 = []
    is_dupl = []
    # newline='' hands raw line endings to the csv module, so line breaks
    # embedded in quoted question texts are kept exactly as stored.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op for an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to load.
                continue
            data.append([int(row[0])])
            documents_1.append(row[1])
            documents_2.append(row[2])
            is_dupl.append(int(row[3]))

    return data, documents_1, documents_2, is_dupl

# Load the question-pairs CSV: pair ids, the two question columns and the
# duplicate flags, in the order returned by get_data.
data, documents_1, documents_2, is_dupl= get_data('quora_question_pairs_rus.csv')

In [5]:
import nltk
# Fetch the NLTK stop-word corpus (a no-op when it is already up to date).
nltk.download('stopwords')
# Russian stop words materialised as a plain list.
stopWords = list(stopwords.words('russian'))
# pymorphy2 analyzer; preprocess() uses it to obtain normal forms of tokens.
morph = pm.MorphAnalyzer()

def preprocess(text: str) -> list:
    """Lowercase, tokenize, drop Russian stop words and lemmatize a text.

    Args:
        text: Raw text of a query or document.

    Returns:
        List of normal forms (taken from the first pymorphy2 parse) of the
        word-character tokens that are not Russian stop words, in their
        original order. Punctuation never reaches the output because the
        tokenizer only matches runs of word characters.
    """
    # Use the stop-word list loaded once at notebook level; the corpus reader
    # used to be invoked again for every single token.
    stop_words = frozenset(stopWords)
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    # Stop words are filtered on surface forms, i.e. before lemmatization.
    return [
        morph.parse(token)[0].normal_form
        for token in tokens
        if token not in stop_words
    ]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### FastText

In [15]:
!wget 'http://vectors.nlpl.eu/repository/11/181.zip'
!unzip '181.zip' -d 'fasttext'

--2019-10-08 17:32:06--  http://vectors.nlpl.eu/repository/11/181.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2622716217 (2.4G) [application/zip]
Saving to: ‘181.zip’


2019-10-08 17:34:15 (19.4 MB/s) - ‘181.zip’ saved [2622716217/2622716217]

Archive:  181.zip
  inflating: fasttext/meta.json      
  inflating: fasttext/model.model    
  inflating: fasttext/model.model.vectors_ngrams.npy  
  inflating: fasttext/model.model.vectors.npy  
  inflating: fasttext/model.model.vectors_vocab.npy  
  inflating: fasttext/README         


In [6]:
# Load the fastText vectors from the 'fasttext' directory that the unzip step
# above populated.
f_model = KeyedVectors.load('fasttext/model.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Position in documents_1 -> raw question text.
index2doc = {position: text for position, text in enumerate(documents_1)}
# Reverse lookup; if a text occurs several times, its last position wins.
doc2index = {text: position for position, text in index2doc.items()}

In [0]:
# Only the first 100000 questions are preprocessed and indexed.
docs_processed = list(map(preprocess, documents_1[:100000]))
# Position -> token list of the preprocessed question at that position.
idx2doc_n = {position: tokens for position, tokens in enumerate(docs_processed)}

In [0]:
def fasttext_matrix(data, model=None):
  """Build a matrix of mean-pooled word vectors, one row per document.

  Args:
    data: Iterable of tokenized documents (each a sequence of tokens).
    model: Optional vector model exposing ``vector_size``, ``vocab`` and
      ``model[token]`` lookup. Defaults to the notebook-level ``f_model``.

  Returns:
    ``np.ndarray`` of shape ``(number of documents, model.vector_size)``.
    Tokens missing from ``model.vocab`` contribute a zero vector to the mean.
    A document without tokens yields an all-zero row.
  """
  if model is None:
    model = f_model
  dim = model.vector_size
  rows = []
  for document in data:
    token_vectors = np.zeros((len(document), dim))
    for i, token in enumerate(document):
      if token in model.vocab:
        # Index the vectors object directly; going through `.wv` on a
        # KeyedVectors instance is deprecated.
        token_vectors[i] = model[token]
    if len(document):
      rows.append(token_vectors.mean(axis=0))
    else:
      # An empty document used to reuse the previous document's vector
      # (or raise NameError if it came first); give it a zero row instead.
      rows.append(np.zeros(dim))
  # reshape keeps the result two-dimensional even when `data` is empty.
  return np.array(rows).reshape(-1, dim)

In [10]:
%%time
ft_matrix = fasttext_matrix(docs_processed)

  import sys


CPU times: user 7.28 s, sys: 328 ms, total: 7.61 s
Wall time: 7.63 s


In [0]:
def cos_sim(v1, v2):
    """Cosine similarity of two 1-D vectors.

    Args:
        v1: First vector (array-like).
        v2: Second vector (array-like) of the same length.

    Returns:
        The inner product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the ratio is undefined there
        and would otherwise evaluate to NaN).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.inner(v1, v2) / denominator

In [0]:
def ft_search(query, ft_matrix, model):
  """Rank indexed documents by cosine similarity to a text query.

  Args:
    query: Raw query text; it goes through the same ``preprocess`` step as
      the indexed documents.
    ft_matrix: Document vectors, one row per document; row ``i`` must belong
      to the text stored in ``index2doc[i]``.
    model: Not used by this function; kept so existing calls keep working.

  Returns:
    Up to 100 ``(similarity, document_text)`` tuples sorted by decreasing
    similarity. Documents with equal similarity are all kept, in corpus
    order.
  """
  query_vec = fasttext_matrix([preprocess(query)])[0]

  # Collect pairs in a list: a dict keyed by similarity silently dropped every
  # document whose score tied with a later one.
  scored = []
  for i, document_vec in enumerate(ft_matrix):
    sim = cos_sim(query_vec, document_vec)
    if np.isnan(sim):
      # An undefined cosine (zero vector) counts as "no similarity"; NaN
      # would also make the sort order unreliable.
      sim = 0.0
    scored.append((sim, index2doc[i]))

  # list.sort is stable, so ties keep their corpus order.
  scored.sort(key=lambda pair: pair[0], reverse=True)
  return scored[:100]

In [13]:
%%time 
ft_search('в чем смысл жизни?', ft_matrix, f_model)[:5]

  import sys
  


CPU times: user 1.21 s, sys: 14.1 ms, total: 1.22 s
Wall time: 1.22 s


[(0.9999999999999999, 'в чем смысл этой жизни'),
 (0.899697839055339, 'в чем смысл или цель жизни'),
 (0.8958170229575113, 'истинный смысл жизни'),
 (0.8865399765234165, 'в чем смысл жизни только в одном слове'),
 (0.83917295080825, 'как мне найти смысл моей жизни')]

### Elmo

In [24]:
!wget 'http://vectors.nlpl.eu/repository/11/196.zip'
!unzip '196.zip' -d 'elmo'

--2019-10-08 17:40:33--  http://vectors.nlpl.eu/repository/11/196.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206986345 (197M) [application/zip]
Saving to: ‘196.zip’


2019-10-08 17:40:46 (15.1 MB/s) - ‘196.zip’ saved [206986345/206986345]

Archive:  196.zip
  inflating: elmo/meta.json          
  inflating: elmo/model.hdf5         
  inflating: elmo/options.json       
  inflating: elmo/README             
  inflating: elmo/vocab.txt          


In [27]:
!unzip 'simple_elmo.zip' 

Archive:  simple_elmo.zip
  inflating: simple_elmo/.DS_Store   
  inflating: simple_elmo/LICENSE     
  inflating: simple_elmo/requirements.txt  
  inflating: simple_elmo/bilm/elmo.py  
  inflating: simple_elmo/bilm/__init__.py  
  inflating: simple_elmo/bilm/__pycache__/model.cpython-36.pyc  
  inflating: simple_elmo/bilm/__pycache__/elmo.cpython-36.pyc  
  inflating: simple_elmo/bilm/__pycache__/data.cpython-37.pyc  
  inflating: simple_elmo/bilm/__pycache__/data.cpython-36.pyc  
  inflating: simple_elmo/bilm/__pycache__/__init__.cpython-36.pyc  
  inflating: simple_elmo/bilm/__pycache__/__init__.cpython-37.pyc  
  inflating: simple_elmo/bilm/model.py  
  inflating: simple_elmo/bilm/data.py  
  inflating: simple_elmo/elmo.ipynb  
  inflating: simple_elmo/__pycache__/elmo_helpers.cpython-36.pyc  
  inflating: simple_elmo/__pycache__/elmo_helpers.cpython-37.pyc  
  inflating: simple_elmo/README      
  inflating: simple_elmo/elmo/vocab.txt.gz  
  inflating: simple_elmo/elmo/options.jso

In [0]:
import tensorflow as tf
import os
os.chdir('/content/simple_elmo/')
from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

In [15]:
# Load the ELMo model from the 'elmo/' subdirectory of the current working
# directory; the three returned objects are passed to get_elmo_vectors below.
batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(os.getcwd()+'/elmo/')



Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [18]:
%%time 
# Index the corpus with ELMo: every sentence becomes the mean of its token
# vectors. `done` ends up with one vector per entry of docs_processed.
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  done = []
  batch_size = 150  # sentences passed to get_elmo_vectors per call
  # Start at 0 and step by batch_size so the final, shorter batch is indexed
  # too (it used to be skipped).
  for start in range(0, len(docs_processed), batch_size):
    batch = docs_processed[start:start + batch_size]
    v_elmo = get_elmo_vectors(sess, batch, batcher,
                              sentence_character_ids, elmo_sentence_input)
    # Pair each output with its own sentence: only the first len(sentence)
    # rows are averaged. Previously every vector in the batch was cut to the
    # length of docs_processed[i], an unrelated sentence.
    for sentence, v in zip(batch, v_elmo):
      if len(sentence):
        done.append(np.mean(v[:len(sentence), :], axis=0))
      else:
        # No tokens to average: use a zero vector instead of a NaN mean.
        done.append(np.zeros(v.shape[1]))


Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in this batch: 150
Sentences in t

CPU times: user 2min 44s, sys: 34.6 s, total: 3min 18s
Wall time: 5min 11s


### __Задача 1__:    
Сравните время индексации корпуса для каждой модели 

In [0]:
%%time
ft_matrix = fasttext_matrix(docs_processed)

  import sys


CPU times: user 6.98 s, sys: 119 ms, total: 7.1 s
Wall time: 7.11 s


FastText индексируется намного быстрее (около 7 с против около 5 мин у ELMo), но такая разница во многом объясняется нехваткой вычислительной мощности и оперативной памяти для ELMo.

### __Задача 2__:    
Выведите качество поиска для каждой модели +  BM25 для сравнения

Качество оцениваем так же, как в прошлом задании:
    - если в топ-5 результатов выдачи попал хоть один релевантный документ, выдача точная
    - если в топ-5 нет ни одного релевантного документа, выдача получает 0
   