In [0]:
%load_ext autoreload

from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

In [118]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [119]:
!pip install pymorphy2



In [0]:
import numpy as np
import os
import csv
import json
import re
import pymorphy2 as pm2 
pmm = pm2.MorphAnalyzer()
from operator import itemgetter

In [0]:
def clean_text(text): ## функция очистки текста и токенизация
    text = re.sub(r'[^\w\s]','',text) 
    text = [pmm.normal_forms(x)[0] for x in text.split()] 
    return text

In [0]:
## открываем файл с запросами и документами
with open('/content/drive/My Drive/quora_question_pairs_rus.csv', 'r', encoding='utf-8') as q:
    str_corpus = csv.reader(q)
    file = list(str_corpus)
    file = file[1:]

In [0]:
## загружаем готовый корпус документов, созданный в прошлом дз
with open("/content/drive/My Drive/doccorpus.json", "r", encoding = 'utf-8') as c:
    doc_corpus = json.load(c) 

In [0]:
d_docs = {} ## словарь с документами и 0 или 1 для них
for i in file:
  d_docs[i[2]] = i[3]

In [0]:
docs = [] ## массив с документами
for i in file:
    docs.append([i[2]])

In [0]:
queries = [] ## получаем массив с лемматизированными запросами
for i in file:
    queries.append(' '.join(clean_text(i[1])))

In [0]:
## сохраняем массив с лемм-ми запросами в json,
## чтоб не собирать корпус запросов каждый раз заново
with open('/content/drive/My Drive/simple_elmo/que_corpus.json', "w", encoding = 'utf-8') as qc: 
    json.dump(queries, qc)

In [0]:
## достаем массив с запросами из json
with open("/content/drive/My Drive/simple_elmo/que_corpus.json", "r", encoding = 'utf-8') as qc:
    queries = json.load(qc)

## **Задача 1. Начнем с fasttext**

In [0]:
##  достаем fasttext модель
ft_model_file = '/content/drive/My Drive/fasttext/model.model' 
ft_model = KeyedVectors.load(ft_model_file)

In [0]:
## функция создания матрицы по модели fasttext
def create_ft_matrix(doc_corpus):
  matrix = []
  for i in doc_corpus:
    lemmas = i.split()
    lemmas_vectors = np.zeros((len(lemmas), ft_model.vector_size))
    vec = np.zeros((ft_model.vector_size,))

    for idx, lemma in enumerate(lemmas):
      if lemma in ft_model.vocab:
        lemmas_vectors[idx] = ft_model.wv[lemma]
        
    if lemmas_vectors.shape[0] is not 0:
      vec = np.mean(lemmas_vectors, axis=0)
    matrix.append(vec)
  return  np.array(matrix)

In [0]:
%%time 
## смотрим время индексации модели fasttext ~ 26-30 сек

ft_matrix = create_ft_matrix(doc_corpus)

  # Remove the CWD from sys.path while we load stuff.


CPU times: user 27.1 s, sys: 3.63 s, total: 30.7 s
Wall time: 30.7 s


In [0]:
def query2vec(query): ## функция создания вектора для запроса по модели fasttext
  for i in query:
    lemmas = i.split()
    lemmas_vectors = np.zeros((len(lemmas), ft_model.vector_size))
    vec = np.zeros((ft_model.vector_size,))

    for idx, lemma in enumerate(lemmas):
      if lemma in ft_model.vocab:
        lemmas_vectors[idx] = ft_model.wv[lemma]
        
    if lemmas_vectors.shape[0] is not 0:
      vec =  np.array(np.mean(lemmas_vectors, axis=0))
    return vec

In [0]:
def cos_sim(v1, v2): ## считает косинусную близость между двумя векторами
   return np.inner(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [0]:
def ft_search(query, ft_matrix, docs): ## функция поиска по модели fasttext
  response = []
  vec = query2vec(query)
  for idx, doc in enumerate(docs):
    if idx < len(ft_matrix):
      doc_score = cos_sim(vec, ft_matrix[idx])
      response.append((docs[idx], doc_score))
  response = sorted(response,key=itemgetter(1), reverse = True)
  return response

In [0]:
# функция проверки точности модели fasttext
# n - количество запросов, на которых проверяем
def check_tf_precision(d_docs, queries, n):
  prec_arr = []
  for i in range(1, n+1):
    response = ft_search(queries[i], ft_matrix, docs)
    top5 = response[:5]
    good_responces = 0
    for d in top5:
      if int(d_docs[d[0][0]]) == 1:
        good_responces += 1
    prec_arr.append(good_responces/len(top5))
  return prec_arr

## **Теперь elmo**
Нет, его не будет, потому что он никак не хочет работать!

In [123]:
!wget "http://vectors.nlpl.eu/repository/11/196.zip"

--2019-10-05 21:15:26--  http://vectors.nlpl.eu/repository/11/196.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206986345 (197M) [application/zip]
Saving to: ‘196.zip.1’


2019-10-05 21:15:39 (16.4 MB/s) - ‘196.zip.1’ saved [206986345/206986345]



In [124]:
!unzip '196.zip' -d 'elmo'

Archive:  196.zip
  inflating: elmo/meta.json          
  inflating: elmo/model.hdf5         
  inflating: elmo/options.json       
  inflating: elmo/README             
  inflating: elmo/vocab.txt          


In [134]:
%load_ext autoreload

import time
import numpy as np
import tensorflow as tf
from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

tf.reset_default_graph()
elmo_path = 'elmo'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: ignored

In [0]:
batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(elmo_path)

NameError: ignored

**Сравнение с bm25**

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [0]:
corpusik = doc_corpus[:10000]

In [0]:
X = vectorizer.fit_transform(corpusik)
f_matrix = X.toarray()

In [141]:
## Создание матрицы tf-ов
doc_matrix = np.transpose(f_matrix)
doc_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
def matr_bm25(all_n, docs_len, doc_matrix, corpus, doc_words, avgdl, N):
    k = 2.0
    b = 0.75
    bm_matrix = []
    for idx, word in enumerate(doc_words):
        bm = 0
        bm_matrix.append([])
        for doc in corpus:
            if word in doc:
                w_idx = doc_words.index(word)
                d_idx = corpus.index(doc)
                TF = doc_matrix[w_idx][d_idx]
                n = all_n[word]
                IDF = log((N-n+0.5)/(n+0.5))          
                l_d = docs_len[doc]
                bm = IDF * ((TF * (k+1))/(TF + k * (1 - b + (b * (l_d/avgdl)))))
            bm_matrix[idx].append(bm)
    return bm_matrix 

In [0]:
def vect_bm25(query, k, b, doc_words):
    vect = []
    IDF = 1.0986122886681096 ## посчитала отдельно
    q_words = [pmm.normal_forms(x)[0] for x in query.split()]
    for word in doc_words:
        bm = 0
        if word in q_words:
            l_d = len(q_words)
            TF = 1/l_d
            bm = IDF * ((TF * (k+1))/(TF + k))
        vect.append(bm)
    vect = np.array(vect)
    return vect

In [0]:
from math import log

doc_words = vectorizer.get_feature_names()
all_n = {}
for word in doc_words:
    w_idx = doc_words.index(word)
    all_n[word] = np.count_nonzero(doc_matrix[w_idx])
    
docs_len = {}
whole_len = 0
for doc in corpusik:
    doc_len = len(doc.split())
    docs_len[doc] = doc_len
whole_len += doc_len

N = len(corpusik) 
avgdl = whole_len/N

In [0]:
bm_matrix = matr_bm25(all_n, docs_len, doc_matrix, corpusik, doc_words, avgdl, N)
bm_matrix = np.array(bm_matrix)

In [0]:
def search(query, bm_matrix):
    doc_words = vectorizer.get_feature_names()
    k = 2.0
    b = 0.75
    q_vect = vect_bm25(query, k, b, doc_words)
    doc_score = q_vect.dot(bm_matrix)
    response = list(zip(docs, doc_score))
    response = sorted(response,key=itemgetter(1), reverse = True)
    return response

In [0]:
# функция проверки точности bm25
# n - количество запросов, на которых проверяем
def check_bm25_precision(d_docs, queries, n):
  prec_arr = []
  for i in range(1, n+1):
    response = search(query, bm_matrix)
    top5 = response[:5]
    good_responces = 0
    for d in top5:
      if int(d_docs[d[0][0]]) == 1:
        good_responces += 1
    prec_arr.append(good_responces/len(top5))
  return prec_arr

**Теперь посмотрим точности для топ5 результатов для бм25 и fasttext**

In [0]:
## Смотрим массив с точностями работы модели FASTTEXT на n запросах
## Итог - везде стабильно только 1 из топ 5 документов удовлетворяет запросу...
## ну, критерию удовлетворяет...

ft_prec_arr = check_tf_precision(d_docs, queries, n=10)
ft_prec_arr

  


[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

In [157]:
bm25_prec_arr = check_tf_precision(d_docs, queries, n=10)
bm25_prec_arr

  


[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]