## ДЗ по поиску

Привет! Вам надо реализивать поисковик на базе вопросов-ответов с сайта [pravoved.ru](https://pravoved.ru/questions-archive/).        
Поиск должен работать на трех технологиях:       
1. обратном индексе     
2. word2vec         
3. doc2vec      

Вы должны понять, какой метод и при каких условиях эксперимента на этом корпусе работает лучше.          
Для измерения качества поиска найдите точность (accuracy) выпадания правильного ответа на конкретный вопрос (в этой базе у каждого вопроса есть только один правильный ответ). Точность нужно измерить для всей базы.    
При этом давайте считать, что выпал правильный ответ, если он попал в **топ-5** поисковой выдачи.

> Сделайте ваш поиск максимально качественным, чтобы значение точности стремилось к 1.     
Для этого можно поэкспериментировать со следующим:       
- модель word2vec (можно брать любую из опен сорса или обучить свою)
- способ получения вектора документа через word2vec: простое среднее арифметическое или взвешивать каждый вектор в соответствии с его tf-idf      
- количество эпох у doc2vec (начинайте от 100)
- предобработка документов для обучения doc2vec (удалять / не удалять стоп-слова)
- блендинг методов поиска: соединить результаты обратного индекса и w2v, или (что проще) w2v и d2v

На это задание отведем 10 дней. Дэдлайн сдачи до полуночи 12.10.

In [638]:
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import judicial_splitter
import collections
from pymystem3 import Mystem
mystem = Mystem()
import numpy as np
from itertools import groupby
from tqdm import tqdm_notebook as tqdm
import os
import pandas as pd
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import operator
import re
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
import math
from model import Model
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
from operator import itemgetter 

In [528]:
import pickle

with open('qa_corpus.pkl', 'rb') as file:
    qa_corpus = pickle.load(file)

Всего в корпусе 1384 пары вопрос-ответ

In [529]:
len(qa_corpus)

1384

Первый элемент блока это вопрос, второй - ответ на него

In [530]:
qa_corpus[0]

['\nДобрый день.Мой сын гражданин Украины (ДНР),имеет вид на жительство в Р.Ф., кот.получил проживая с 2014 г. в Нижегородской области.В 2017г. переехал на постоянное место жительство в г.Ростов.Официально трудоустроился на одно из промышл.предприятий г.Ростова.Оформил временную регистрацию в Ростове.В УФМС предупредили,что по истечении 90 дней он должен либо постоянно прописаться либо покинуть территорию России.Прошу проконсультировать как быть дальше.(Вернуться домой в Донецк,но здесь идет война,работы нет.В Ростове он работает по специальности.Он инженер машиностроитель.)Временная прописка до 15 марта.  Если он сможет приобрести какую либо недвижимость,как долго будет решаться вопрос о его постоянной прописке в Ростове.Как в этом случае будет решаться вопрос с видом на жительство в Ростове? Не получится ли ,что приобретя квартиру,он не успеет в ней прописаться до окончании срока временной регистрации. С уважением Людмила Евгеньевна.\n',
 'Добрый вечер!Из Вашего вопроса вообще ничего

In [5]:
questions, answers = [], []
for qa in qa_corpus:
    question = qa[0]
    answer = qa[1]
    questions.append(question)
    answers.append(answer)

In [531]:
def preprocessing(input_text, del_stopwords=True, del_digit=True):
    """
    :input: raw text
        1. lowercase, del punctuation, tokenize
        2. normal form
        3. del stopwords
        4. del digits
    :return: lemmas
    """
    russian_stopwords = set(stopwords.words('russian'))
    if del_digit:
        input_text = re.sub('[0-9]', '', input_text)
    words = [x.lower().strip(string.punctuation+'»«–…') for x in word_tokenize(input_text)]
    lemmas = [mystem.lemmatize(x)[0] for x in words if x]

    lemmas_arr = []
    for lemma in lemmas:
        if del_stopwords:
            if lemma in russian_stopwords:
                continue
        lemmas_arr.append(lemma)
    return lemmas_arr

# TF.IDF

In [532]:
def compute_tf(text):
    tf_text = collections.Counter(text)
    for i in tf_text:
        tf_text[i] = tf_text[i]/float(len(text))
    return tf_text

In [533]:
def compute_idf(word, corpus):
    return math.log10(len(corpus)/sum([1.0 for i in corpus if word in i]))

In [536]:
def compute_tfidf(corpus):
    documents_list = []
    for text in tqdm(corpus):
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf(word, corpus)
        documents_list.append(tf_idf_dictionary)
    return documents_list

#corpus = [['pasta', 'la', 'vista', 'baby', 'la', 'vista'], 
#['hasta', 'siempre', 'comandante', 'baby', 'la', 'siempre'], 
#['siempre', 'comandante', 'baby', 'la', 'siempre']]
#compute_tfidf(corpus)

In [640]:
def compute_new_tfidf(word, quary, corpus):
    try:
        quary = preprocessing(quary)
        computed_tf = compute_tf(quary)[word]
        tfidf = computed_tf * compute_idf(word, corpus)
    except:
        tfidf = 0.0
    return tfidf

In [642]:
compute_new_tfidf('бульдог', 'коричневый бульдог', corpus)

1.72108304289236

In [636]:
#corpus = [preprocessing(q) for q in questions] + [preprocessing(a) for a in answers]
#tfidf = compute_tfidf(corpus)

In [538]:
tfidf_questions = tfidf[0:len(questions)]
tfidf_answers = tfidf[len(questions):]

# w2v

In [539]:
# если модель без тэгов
model_without_pos = Word2Vec.load('/Users/irene/Downloads/IR/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model')

In [611]:
def get_w2v_vectors_paragraph(paragraph, model, tfidf, ind, multiply_tfidf=True, pos=False):
    """Получает вектор для параграфа"""
    lemmas_paragraph = preprocessing(paragraph)
    #print('lemmas_paragraph', lemmas_paragraph)
    if len(lemmas_paragraph) == 0:
        return np.zeros(300)
    else:
        vector_paragraph = []
        for lemma in lemmas_paragraph:
            if pos:
                lemma = lemma + '_' + get_pos(lemma)
            try:
                if multiply_tfidf:
                    tfidf = compute_new_tfidf(lemma, paragraph, corpus)
                    vector = model.wv[lemma] * tfidf
                else:
                    vector = model.wv[lemma]
            except:
                vector = np.zeros(300)
            vector_paragraph.append(vector)
        vec = np.array(vector_paragraph).sum(axis=0) / len(vector_paragraph)
        return vec.tolist()

In [541]:
def get_w2v_vectors_text(text, model, tfidf, ind, len_par=4, multiply_tfidf=True, pos=False):
    """Получает массив векторов параграфов"""
    paragraphs = judicial_splitter.splitter(text, len_par)
    return [(paragraph, get_w2v_vectors_paragraph(paragraph, model, tfidf, ind, multiply_tfidf=multiply_tfidf, pos=pos)) for paragraph in paragraphs]

In [542]:
def save_w2v_base(answers, model, tfidf, len_par=4, multiply_tfidf=True, pos=False):
    """Индексирует всю базу для поиска через word2vec"""
    id_answer = []
    text_of_paragraph = []
    w2v = []
    for i, answer in tqdm(enumerate(answers)):
        v_paragraphs = get_w2v_vectors_text(answer, model, tfidf, i, len_par=len_par, multiply_tfidf=multiply_tfidf, pos=pos)
        for v_p in v_paragraphs:
            id_answer.append(i)
            text_of_paragraph.append(v_p[0])
            w2v.append(v_p[1])
    return id_answer, text_of_paragraph, w2v

In [543]:
def create_df(name_cols, data_cols):
    df = {}
    for i, name in enumerate(name_cols):
        df[name] = data_cols[i]
    df = pd.DataFrame(data=df)
    return df

# БЕЗ TF.IDF

In [544]:
id_answer, text_of_paragraph, w2v = save_w2v_base(answers, model_without_pos, tfidf_answers, len_par=2, multiply_tfidf=False)

A Jupyter Widget

In [545]:
df = create_df(['id_answer', 'text_of_paragraph', 'w2v'], [id_answer, text_of_paragraph, w2v])

# С TF.IDF

In [546]:
id_answer_tfidf, text_of_paragraph_tfidf, w2v_tfidf = save_w2v_base(answers, model_without_pos, tfidf_answers, len_par=2, multiply_tfidf=True)
df['w2v_tfidf'] = w2v_tfidf

A Jupyter Widget

# С POS-tagging

In [547]:
def get_pos(lemma):
    sentences = model_udpipe.tokenize(lemma)
    for s in sentences:
        model_udpipe.tag(s)
        model_udpipe.parse(s)
    conllu = model_udpipe.write(sentences, "conllu")
    conllu = re.sub('# .+?\n', '', conllu)
    pos = re.search('(.+?\t){3}(.+?)\t', conllu).group(2)
    return pos

In [549]:
# модель с тегами
model_pos = KeyedVectors.load_word2vec_format('/Users/irene/Downloads/IR/tayga_1_2.vec', binary=False)
model_udpipe = Model('russian-ud-2.0-170801.udpipe')
id_answer_pos, text_of_paragraph_pos, w2v_pos = save_w2v_base(answers, model_pos, tfidf_answers, len_par=2, multiply_tfidf=False, pos=True)
df['w2v_pos'] = w2v_pos
id_answer_pos_tfidf, text_of_paragraph_pos_tfidf, w2v_pos_tfidf = save_w2v_base(answers, model_pos, tfidf_answers, len_par=2, pos=True)
df['w2v_pos_tfidf'] = w2v_pos

A Jupyter Widget

A Jupyter Widget

# train my model

https://drive.google.com/file/d/14wqRshysIn0Cs4dDM_WaW4jPTtix1sgD/view?usp=sharing

In [550]:
train_text = [preprocessing(ans_1) for ans_1 in answers]
path = get_tmpfile("my_word2vec.model")
my_model = Word2Vec(train_text, size=300, window=5, min_count=1, workers=4)
my_model.save("my_word2vec.model")
my_id_answer, my_text_of_paragraph, my_w2v = save_w2v_base(answers, my_model, tfidf_answers, len_par=2, multiply_tfidf=False)
df['my_w2v'] = my_w2v

A Jupyter Widget

# d2v

In [551]:
def train_doc2vec(data, epochs=100):
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
    model = Doc2Vec(vector_size=100, min_count=5, alpha=0.025, 
                min_alpha=0.025, epochs=epochs, workers=4, dm=1)

    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [552]:
fname = get_tmpfile("model_doc2vec_QA")
#model_doc2vec = train_doc2vec(df['text_of_paragraph'], epochs=1000)
#model_doc2vec.save(fname)
model_doc2vec = Doc2Vec.load(fname)

In [553]:
def get_d2v_vectors(paragraph, model_doc2vec, steps=5, alpha=0.1):
    """Получает вектор параграфа"""
    lemmas_paragraph = preprocessing(paragraph, del_stopwords=False)
    model_doc2vec.random.seed(100)
    vector = model_doc2vec.infer_vector(lemmas_paragraph, steps=steps, alpha=alpha)
    return vector.tolist()

def save_d2v_base(paragraphs, model_doc2vec, steps=5, alpha=0.1):
    """Индексирует всю базу для поиска через doc2vec"""
    vectors_d2v = []
    for par in tqdm(paragraphs):
        vectors_d2v.append(get_d2v_vectors(par, model_doc2vec, steps=steps, alpha=alpha))
    return vectors_d2v

In [554]:
df['d2v_simple'] = save_d2v_base(df['text_of_paragraph'], model_doc2vec)
df['d2v_hypo'] = save_d2v_base(df['text_of_paragraph'], model_doc2vec, steps=10, alpha=0.025)

A Jupyter Widget

A Jupyter Widget

In [555]:
df.head()

Unnamed: 0,id_answer,text_of_paragraph,w2v,w2v_tfidf,w2v_pos,w2v_pos_tfidf,my_w2v,d2v_simple,d2v_hypo
0,0,Добрый вечер!Из Вашего вопроса вообще ничего н...,"[-0.0037454424891620874, 0.0019313085358589888...","[0.00010870089317904785, -4.5986711484147236e-...","[0.00604688091889808, 0.005329290515204009, -0...","[0.00604688091889808, 0.005329290515204009, -0...","[0.08620098978281021, -0.07913730293512344, -0...","[0.4394567906856537, 1.2520071268081665, -0.14...","[0.17822308838367462, 1.4375102519989014, -0.2..."
1,1,"Оксана, Вы вправе не платить налог, если являе...","[0.044678158489987255, -0.002064136316378911, ...","[0.002117195782605753, 0.00015293818622012624,...","[-0.017549448613232623, 0.008853768670621017, ...","[-0.017549448613232623, 0.008853768670621017, ...","[0.1710958331823349, -0.07223200798034668, -0....","[1.8747808933258057, 1.2592966556549072, -0.29...","[0.854407012462616, 0.8900038003921509, 0.1755..."
2,2,"Здравствуйте, Илья! Можно ли подать приложения...","[0.008203479933003202, -0.009832145156673132, ...","[0.00018833383206257192, -0.000303040981331529...","[-0.022324788393094562, 0.0005549648896050759,...","[-0.022324788393094562, 0.0005549648896050759,...","[0.16408859193325043, -0.10573263466358185, -0...","[0.4959423243999481, 1.1039215326309204, 0.742...","[0.14572197198867798, 1.0616368055343628, 0.38..."
3,2,"Лица, участвующие в деле, вправе представлять ...","[0.015275483950972557, -0.012329040095210075, ...","[0.00038809291436336935, -0.000794871943071484...","[-0.03062480833829829, 0.0029290563996053403, ...","[-0.03062480833829829, 0.0029290563996053403, ...","[0.1822725087404251, -0.08508923649787903, -0....","[-0.09858600050210953, 0.22491450607776642, -0...","[-0.3880166709423065, 0.5007378458976746, 0.07..."
4,2,"Такие документы выполняются в форме, установле...","[0.014798891730606556, -0.006705345120280981, ...","[0.000794209074229002, -0.00045195137499831617...","[-0.049720443636178974, 0.015440374696627259, ...","[-0.049720443636178974, 0.015440374696627259, ...","[0.21689358353614807, -0.0350755900144577, -0....","[0.7417151927947998, -1.1962138414382935, 1.63...","[0.43548583984375, -0.8425878286361694, 1.3399..."


In [556]:
df.to_csv('QA_df.csv')

https://drive.google.com/file/d/1CHnCTjIXWjAEmGAPW0_TCH47tm2SgASO/view?usp=sharing

# w2v + d2v (общие функции для поиска)

In [557]:
from gensim import matutils
import numpy as np 

def similarity(v1, v2):
    v1_norm = matutils.unitvec(np.array(v1))
    v2_norm = matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

In [558]:
def res_v(vectors, names_doc, v_quary):
    res = []
    for i, vector in enumerate(vectors):
        cos_sim = similarity(v_quary, vector)
        res.append([names_doc[i], cos_sim, i])
    res.sort(key=operator.itemgetter(1), reverse=True)
    return res

In [559]:
def res_without_dupl(res, top=5):
    res_without_dupl = set()
    inds = []
    for ind, r in enumerate(res):
        if r[0] in res_without_dupl:
            continue
        else:
            if len(res_without_dupl) == top:
                break
            res_without_dupl.add(r[0])
            inds.append(ind)
        ind += 1
    return itemgetter(*inds)(res)

In [560]:
res = [[1, 0.9, 2], [1, 0.8, 1], [2, 0.7, 1], [2, 0.5, 3], [2, 0.5, 2], [3, 0.3, 1], [3, 0.2, 2], [4, 0.2, 1], [5, 0.2, 1], [6, 0.1, 1]]

In [561]:
res_without_dupl(res, top=1)

[1, 0.9, 2]

In [562]:
def search_w2v(quary, model, vectors_w2v, names_doc, tfidf, ind, multiply_tfidf=True, pos=False, top=5):
    v_quary = get_w2v_vectors_paragraph(quary, model, tfidf, ind, multiply_tfidf=multiply_tfidf, pos=pos)
    res = res_v(vectors_w2v, names_doc, v_quary)
    res = res_without_dupl(res, top=top)
    return res

def search_d2v(quary, model, vectors_d2v, names_doc, steps=5, alpha=0.1, top=5):
    v_quary = get_d2v_vectors(quary, model, steps=steps, alpha=alpha)
    res = res_v(vectors_d2v, names_doc, v_quary)
    res = res_without_dupl(res, top=top)
    return res

# Обратный индекс

In [563]:
lemmatized_texts = []
for each_f in tqdm(answers):
    lemmatized = ' '.join([x for x in preprocessing(each_f) if x != ' '])
    lemmatized_texts.append(lemmatized)

A Jupyter Widget

In [564]:
vec = CountVectorizer()
X = vec.fit_transform(lemmatized_texts)
df_index = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
words = list(vec.get_feature_names())

In [565]:
df_index.head()

Unnamed: 0,alexandr,azbuka,base,block,buduprav,buffer,cookie,div,docs,dvd,...,январь,январялюдмилая,ярлык,ярослав,ярославский,ясно,ясный,яхта,ячейка,ящик
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [566]:
df_index.shape

(1384, 7145)

In [567]:
def inverted_index(df) -> dict:
    """
    Create inverted index by input doc collection
    :return: inverted index
    """
    files = []
    for word in df:
        sub = []
        docs = np.where(df[word] > 0)[0]
        for f in docs:
            dl = len(lemmatized_texts[f].split())
            fr = round(df[word][f]/dl, 4)
            sub.append([f, dl, fr])
        files.append(sub)
    index = pd.DataFrame(data={'Слово': words, 'Информация': files})
    return index

In [568]:
index = inverted_index(df_index)

In [569]:
index.head()

Unnamed: 0,Информация,Слово
0,"[[1317, 132, 0.0076]]",alexandr
1,"[[185, 386, 0.0026]]",azbuka
2,"[[749, 300, 0.0033]]",base
3,"[[749, 300, 0.0033]]",block
4,"[[614, 430, 0.0047]]",buduprav


In [570]:
from math import log

k1 = 2.0
b = 0.75
avgdl = round(sum([len(q.split(' ')) for q in lemmatized_texts])/len(lemmatized_texts))#средняя длина док-ов в коллекции
N = len(lemmatized_texts)

def score_BM25(qf, dl, avgdl, k1, b, N, n) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    score = math.log((N-n+0.5)/(n+0.5)) * (k1+1)*qf/(qf+k1*(1-b+b*(dl/avgdl)))
    return score

In [571]:
def compute_sim(lemma, inverted_index) -> float:
    """
    Compute similarity score between word in search query and all document  from collection
    :return: score
    """
    doc_list = list(inverted_index.loc[inverted_index['Слово'] == lemma]['Информация'])[0]
    #print(len(doc_list))
    relevance_dict = {}
    for doc in doc_list:
        relevance_dict[doc[0]] = score_BM25(doc[2], doc[1], avgdl, k1, b, N, len(doc_list))
    return relevance_dict

In [572]:
def get_search_result(query, top=5) -> list:
    """
    Compute sim score between search query and all documents in collection
    Collect as pair (doc_id, score)
    :param query: input text
    :return: list of lists with (doc_id, score)
    """
    query = [que for que in preprocessing(query) if que in words]
    #print(query)
    res = {}
    for word in query:
        relevance_dict = compute_sim(word, index)
        res = {k: res.get(k, 0) + relevance_dict.get(k, 0) for k in set(res) | set(relevance_dict)}
    return sorted(res.items(), key=operator.itemgetter(1), reverse=True)[0:top]

In [573]:
len(get_search_result('суд', top=1384))

353

# Blending

## * w2v + d2v

In [None]:
def blend_d2v_w2v(res_w2v, res_d2v, v, top=5):
    res_w2v = sorted(res_w2v, key = lambda x: (x[0], x[2]))
    res_d2v = sorted(res_d2v, key = lambda x: (x[0], x[2]))
    ranges = []
    for i, res3 in enumerate(res_w2v):
        new_range = res3[1] * v + res_d2v[i][1] * (1-v)
        ranges.append((res3[0], new_range))
    ranges = sorted(ranges, key = lambda x: (x[1]), reverse=True)
    ranges = res_without_dupl(ranges, top=top)
    return ranges

In [575]:
res_w2v = [[2, 0.3 , 1], [2, 0.2, 2], [2, 0.1, 3], [1, 0.1, 1], [1, 0.1, 2]]
res_d2v = [[2, 0.8, 2], [2, 0.7, 1], [1, 0.5, 1], [2, 0.2, 3], [1, 0.1, 2]]

In [576]:
blend_d2v_w2v(res_w2v, res_d2v, 0.8, top=5)

[(2, 0.37999999999999995),
 (2, 0.32),
 (1, 0.18),
 (2, 0.12000000000000001),
 (1, 0.1)]

## * w2v + inverted index

In [577]:
def norm(vector):
    a = np.asarray(vector)
    return np.interp(a, (a.min(), a.max()), (-1, +1))

In [578]:
def f(x):
    return x[1] / x[0]

In [579]:
def mean_w2v(res_w2v):
    df = pd.DataFrame(list(res_w2v))
    df = df.groupby(0)[1].agg(["count", "sum", "mean"])
    m = df['mean']
    return [(i, el) for i, el in enumerate(m)]

In [580]:
res_w2v = [[2, 0.5, 3], [2, 0.5, 2], [3, 0.3, 1], [3, 0.2, 2], [4, 0.2, 1], [5, 0.2, 1], [6, 0.1, 1], [2, 0.7, 1]]

In [581]:
mean_w2v(res_w2v)

[(0, 0.5666666666666667), (1, 0.25), (2, 0.2), (3, 0.2), (4, 0.1)]

In [582]:
def blend_w2v_index(res_w2v, res_index, v, top=5):
    res_w2v = mean_w2v(res_w2v)
    res_w2v = sorted(res_w2v, key = lambda x: (x[0]))
    res_index = sorted(res_index, key = lambda x: (x[0]))
    files_ind = [r[0] for r in res_index]
    res_w2v = np.asarray(res_w2v)[files_ind]
    res_w2v_norm = [(i, j) for i, j in enumerate(norm([l[1] for l in res_w2v]))]
    res_index_norm = [(i, j) for i, j in enumerate(norm([d[1] for d in res_index]))]
    ranges = []
    for i, res3 in enumerate(res_w2v_norm):
        new_range = res3[1] * v + res_index_norm[i][1] * (1-v)
        ranges.append((res3[0], new_range))
    return sorted(ranges, key = lambda x: (x[1]), reverse=True)[0:top]

In [583]:
res_w2v = [[0, 0.5, 3], [0, 0.5, 2], [1, 0.3, 1], [1, 0.2, 2], [2, 0.2, 1], [3, 0.2, 1], [4, 0.1, 1], [0, 0.1, 1], [54, 0.1, 1]]
res_index = [(1, 65), (0, 6), (2, 5), (4, 2), (3, 3)]

In [584]:
blend_w2v_index(res_w2v, res_index, v, top=5)

[(1, 0.8347222222222221),
 (0, -0.5192239858906526),
 (2, -0.7810846560846562),
 (3, -0.8325837742504409),
 (4, -1.0)]

# Расчет точности

In [585]:
def accuracy_index(questions, top=5):
    true = 0
    for i, q in tqdm(enumerate(questions)):
        top5 = [res[0] for res in get_search_result(q, top=top)]
        if i in top5:
            true += 1
    ACCURACY = true / len(questions)
    return ACCURACY

In [586]:
def accuracy_w2v(questions, tfidf_questions, w2v, multiply_tfidf=True, pos=False, top=5):
    true = 0
    for i, q in tqdm(enumerate(questions)):
        top5 = [res[0] for res in search_w2v(q, model_without_pos, w2v, df['id_answer'], tfidf_questions, i, multiply_tfidf=multiply_tfidf, pos=pos, top=top)]
        if i in top5:
            true += 1
    ACCURACY = true / len(questions)
    return ACCURACY

In [587]:
def accuracy_d2v(questions, model, d2v, steps=5, alpha=0.1, top=5):
    true = 0
    for i, q in tqdm(enumerate(questions)):
        top5 = [res[0] for res in search_d2v(q, model_doc2vec, d2v, df['id_answer'], steps=steps, alpha=alpha, top=top)]
        if i in top5:
            true += 1
    ACCURACY = true / len(questions)
    return ACCURACY

### Обратный индекс

In [588]:
acc_5_ind = accuracy_index(questions)
acc_10_ind = accuracy_index(questions, top=10)

A Jupyter Widget

A Jupyter Widget

In [589]:
print('Точность метода "обратный индекс" на top-5:', acc_5_ind)
print('Точность метода "обратный индекс" на top-10:', acc_10_ind)

Точность метода "обратный индекс" на top-5: 0.30852601156069365
Точность метода "обратный индекс" на top-10: 0.3916184971098266


### w2v

In [591]:
#acc_5_w2v = accuracy_w2v(questions, tfidf_questions, df['w2v'], multiply_tfidf=False, pos=False)
#acc_10_w2v = accuracy_w2v(questions, tfidf_questions, df['w2v'], multiply_tfidf=False, pos=False, top=10)

In [593]:
#my_acc_5_w2v = accuracy_w2v(questions, tfidf_questions, df['my_w2v'], multiply_tfidf=False, pos=False)
#my_acc_10_w2v = accuracy_w2v(questions, tfidf_questions, df['my_w2v'], multiply_tfidf=False, pos=False, top=10)
#acc_5_w2v_pos = accuracy_w2v(questions, tfidf_questions, df['w2v_pos'], multiply_tfidf=False, pos=True)
#acc_10_w2v_pos = accuracy_w2v(questions, tfidf_questions, df['w2v_pos'], multiply_tfidf=False, pos=True, top=10)

In [643]:
acc_5_w2v_tfidf = accuracy_w2v(questions, tfidf_questions, df['w2v_tfidf'], multiply_tfidf=True, pos=False)
acc_10_w2v_tfidf = accuracy_w2v(questions, tfidf_questions, df['w2v_tfidf'], multiply_tfidf=True, pos=False, top=10)
acc_5_w2v_pos_tfidf = accuracy_w2v(questions, tfidf_questions, df['w2v_pos_tfidf'], multiply_tfidf=True, pos=True)
acc_10_w2v_pos_tfidf = accuracy_w2v(questions, tfidf_questions, df['w2v_pos_tfidf'], multiply_tfidf=True, pos=True, top=10)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

In [644]:
print('Точность метода "w2v" на top-5:', acc_5_w2v)
print('Точность метода "w2v" на top-10:', acc_5_w2v)
print('Точность метода "my w2v" на top-5:', my_acc_5_w2v)
print('Точность метода "my w2v" на top-10:', my_acc_10_w2v)
print('Точность метода "w2v+pos" на top-5:', acc_5_w2v_pos)
print('Точность метода "w2v+pos" на top-10:', acc_5_w2v_pos)
print('Точность метода "w2v+tfidf" на top-5:', acc_5_w2v_tfidf)
print('Точность метода "w2v+tfidf" на top-10:', acc_5_w2v_tfidf)
print('Точность метода "w2v+pos+tfidf" на top-5:', acc_5_w2v_pos_tfidf)
print('Точность метода "w2v+pos+tfidf" на top-10:', acc_5_w2v_pos_tfidf)

Точность метода "w2v" на top-5: 0.282514450867052
Точность метода "w2v" на top-10: 0.282514450867052
Точность метода "my w2v" на top-5: 0.002890173410404624
Точность метода "my w2v" на top-10: 0.007947976878612716
Точность метода "w2v+pos" на top-5: 0.002890173410404624
Точность метода "w2v+pos" на top-10: 0.002890173410404624
Точность метода "w2v+tfidf" на top-5: 0.3208092485549133
Точность метода "w2v+tfidf" на top-10: 0.3208092485549133
Точность метода "w2v+pos+tfidf" на top-5: 0.0036127167630057803
Точность метода "w2v+pos+tfidf" на top-10: 0.0036127167630057803


### Лучший метод - "w2v+tfidf"

### d2v

In [645]:
acc_5_d2v = accuracy_d2v(questions, model_doc2vec, df['d2v_simple'])
acc_10_d2v = accuracy_d2v(questions, model_doc2vec, df['d2v_simple'], top=10)
acc_5_d2v_hypo = accuracy_d2v(questions, model_doc2vec, df['d2v_hypo'], steps=10, alpha=0.025)
acc_10_d2v_hypo = accuracy_d2v(questions, model_doc2vec, df['d2v_hypo'], steps=10, alpha=0.025, top=10)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

In [646]:
print('Точность метода "d2v" на top-5:', acc_5_d2v)
print('Точность метода "d2v" на top-10:', acc_10_d2v)
print('Точность метода "d2v+гиперпараметры" на top-5:', acc_5_d2v_hypo)
print('Точность метода "d2v+гиперпараметры" на top-10:', acc_10_d2v_hypo)

Точность метода "d2v" на top-5: 0.13945086705202311
Точность метода "d2v" на top-10: 0.1611271676300578
Точность метода "d2v+гиперпараметры" на top-5: 0.16329479768786126
Точность метода "d2v+гиперпараметры" на top-10: 0.19291907514450868


### Лучший метод - "d2v+гиперпараметры"

### Возьмем лучшие модели w2v и d2v

In [647]:
def accuracy_blending(v, questions, tfidf_questions, w2v, d2v, multiply_tfidf=True, pos=False, top=1384, steps=5, alpha=0.1):
    true = 0
    for i, q in enumerate(tqdm(questions)):
        top_w2v = [(res1[0], res1[1], res1[2]) for res1 in search_w2v(q, model_without_pos, w2v, df['id_answer'], tfidf_questions, i, multiply_tfidf=multiply_tfidf, pos=pos, top=len(df['id_answer']))]
        top_d2v = [(res2[0], res2[1], res2[2]) for res2 in search_d2v(q, model_doc2vec, d2v, df['id_answer'], steps=steps, alpha=alpha, top=len(df['id_answer']))]
        #print(top_w2v)
        top = [res3[0] for res3 in blend_d2v_w2v(top_w2v, top_d2v, v, top=5)]
        if i in top:
            true += 1
    ACCURACY = true / len(questions)
    return ACCURACY

In [648]:
for v in np.linspace(0.1, 0.9, 10):
    acc_blend = accuracy_blending(v, questions, tfidf_questions, df['w2v_tfidf'], df['d2v_hypo'], multiply_tfidf=True, pos=False, top=5, steps=5, alpha=0.1)
    print('Точность метода "d2v+w2v" на top-5 с весами', str(round(v, 1)), 'и', str(round(1-v, 1)), ':', acc_blend)

A Jupyter Widget

BrokenPipeError: [Errno 32] Broken pipe

### index + w2v

In [525]:
def accuracy_ind_w2v(v, questions, tfidf_questions, w2v, multiply_tfidf=True, pos=False, top=5):
    true = 0
    for i, q in enumerate(tqdm(questions)):
        top_w2v = [(res1[0], res1[1], res1[2]) for res1 in search_w2v(q, model_without_pos, w2v, df['id_answer'], tfidf_questions, i, multiply_tfidf=multiply_tfidf, pos=pos, top=df['id_answer'])]
        top_ind = [(res[0], res[1]) for res in get_search_result(q, top=1384)]
        top = [res3[0] for res3 in blend_w2v_index(top_w2v, top_ind, v, top=5)]
        if i in top:
            true += 1
    ACCURACY = true / len(questions)
    return ACCURACY

In [526]:
for v in np.linspace(0.1, 0.9, 10):
    acc_blend = accuracy_ind_w2v(v, questions, tfidf_questions, df['w2v_tfidf'], multiply_tfidf=True, pos=False, top=5)
    print('Точность метода "w2v+index" на top-5 с весами', str(round(v, 1)), 'и', str(round(1-v, 1)), ':', acc_blend)

A Jupyter Widget

Точность метода "w2v+index" на top-5 с весами 0.1 и 0.9 : 0.004335260115606936


A Jupyter Widget

KeyboardInterrupt: 

In [None]:
acc_blend