### Задание по проекту. 

Для его выполнения вам понадобится собранная коллекция документов и функция, составляющая обратный индекс по словам в коллекции.

Напишите функцию (или несколько отдельных логичный функций), которая по запросу  Q=q1,...,gnQ=q1,...,gn  и коллекции  DD  сортирует выдачу подходящих документов. Будем считать документ подходящим, если он содержит хотя бы одно слово из запроса (из которого удалены стоп-слова). В качестве метрики используйте Okapi BM25.
Для проверки работы функции на вашем корпусе используйте запрос каникулы на новый год и рождество. Выведите ссылки в ipynb на первые десять докуменов в отсортированной выдаче(как во втором семинаре с помощью IPython.display) и их оценку BM25. Напомню, что ссылки на документы хрянятся в самих доках под тэгом @url.

В этом случае можно использовать формулу *Okapi best match 25* ([Okapi BM25](https://ru.wikipedia.org/wiki/Okapi_BM25)). Пусть дан запрос $Q$, содержащий слова  $q_1, ... , q_n$, тогда функция BM25 даёт следующую оценку релевантности документа $D$ запросу $Q$:

$$ score(D, Q) = \sum_{i}^{n} \text{IDF}(q_i)*\frac{(k+1)*f(q_i,D)}{f(q_i,D)+k_1(1-b+b\frac{|D|}{avgdl})} $$ 
где $f(q_i,D)$ - частота слова (TF) $q_i$ в документе $D$, $|D|$ - длина документа (количество слов в нём), а *avgdl* — средняя длина документа в коллекции. 
$$$$
$k_1$ и $b$ — свободные коэффициенты, обычно их выбирают как $k_1$=2.0 и $b$=0.75.
$$$$
$\text{IDF}(q_i)$ есть обратная документная частота (IDF) слова $q_i$: 
$$\text{IDF}(q_i) = \log\frac{N-n(q_i)+0.5}{n(q_i)+0.5},$$
где $N$ - общее количество документов в коллекции, а  $n(q_i)$ — количество документов, содержащих $q_i$. 

In [51]:
#from scipy.sparse import dok_matrix
#import numpy as np
#from collections import defaultdict
#from sklearn.model_selection import train_test_split
from math import log
from collections import Counter,  defaultdict
import glob
#from pymystem3 import Mystem
from pymystem3 import Mystem
import re

In [41]:
k1 = 2.0
b = 0.75

'''qf - word frequency in the doc
dl - doc length(number of words in the doc)
avdl - medium length of doc in the collection
N - number of documents in the collection
n - number of docs with a certain word (number of indexes)'''

def score_BM25(n, qf, N, dl, avdl):
    K = compute_K(dl, avdl)
    IDF = log((N - n + 0.5) / (n + 0.5))
    frac = ((k1 + 1) * fq) / (K + fq)
    return IDF * frac

def compute_K(dl, avdl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)))


def BM25_foralldocs(query, list_of_word_massives, frequency, index_dict, BM25_for_all):
    
    avdl = 0
    
    for doc in list_of_word_massives:
        total_dl += len(doc)
        
    for n_text, doc in enumerate(list_of_word_massives):
        for word in query:
            score = score_BM25(index_dict[word], frequency[str(n_text)][word], len(list_of_word_massives),
                               len(list_of_word_massives[str(n_text)]), (total_dl/len(list_of_word_massives)))
            if str(n_text) in BM25_for_all:
                BM25_for_all[str(n_text)] += score
            else:
                BM25_for_all[str(n_text)] = score
        
    return BM25_for_all
    

    


In [53]:
#functions copied from hw1(a bit modified)
def text_to_words_massive(list_of_docs, stop_words):                                                               
    list_of_word_massives = []
    for doc in list_of_docs:
        m = Mystem()
        all_words = m.lemmatize(open(doc, 'r', encoding='UTF-8').read())
        
        all_words_clean = []
        for w in all_words: 
            if w in stop_words or re.search('[А-Яа-я]', w) == None:
                pass
            else:
                all_words_clean.append(w)
        
        '''all_words_clean = []
        for w in all_words: 
            
            if word not in stop_words:
                all_words_clean.append(w)
            else:
                pass'''
        list_of_word_massives.append(all_words_clean)
        
    return list_of_word_massives
    

def reverse_index(list_of_word_massives, index_dict):
    #функция выдающая массив индексов для каждого слова в коллекции документов
    for n_text, doc in enumerate(list_of_word_massives):
        for word in set(doc):
            index_dict.setdefault(word, [])
            if str(n_text) not in index_dict[word]:
                index_dict[word].append(str(n_text))

    return index_dict


def word_frequency(list_of_word_massives, frequency):            
    #функция, подсчитывающая частоту каждого слова для каждого документа в коллекции
    for n, doc in enumerate(list_of_word_massives):
        frequency_dict = {}
        for word in doc:
            if word in frequency_dict:
                frequency_dict[word] += 1
            else:
                frequency_dict[word] = 1
        
        for word in frequency_dict:
            frequency_dict[word] = frequency_dict[word]/len(doc)
        
        frequency[str(n)] = frequency_dict
            
    return frequency


stop_words_list = ["а","алло","без","белый","близко","более","больше","большой","будем","будет","будете","будешь","будто","буду","будут","будь","бы","бывает","бывь","был","была","были","было","быть","в","важная","важное","важные","важный","вам","вами","вас","ваш","ваша","ваше","ваши","вверх","вдали","вдруг","ведь","везде","вернуться","весь","вечер","взгляд","взять","вид","видеть","вместе","вниз","внизу","во","вода","война","вокруг","вон","вообще","вопрос","восемнадцатый","восемнадцать","восемь","восьмой","вот","впрочем","времени","время","все","всегда","всего","всем","всеми","всему","всех","всею","всю","всюду","вся","всё","второй","вы","выйти","г","где","главный","глаз","говорил","говорит","говорить","год","года","году","голова","голос","город","да","давать","давно","даже","далекий","далеко","дальше","даром","дать","два","двадцатый","двадцать","две","двенадцатый","двенадцать","дверь","двух","девятнадцатый","девятнадцать","девятый","девять","действительно","дел","делать","дело","день","деньги","десятый","десять","для","до","довольно","долго","должно","должный","дом","дорога","друг","другая","другие","других","друго","другое","другой","думать","душа","е","его","ее","ей","ему","если","есть","еще","ещё","ею","её","ж","ждать","же","жена","женщина","жизнь","жить","за","занят","занята","занято","заняты","затем","зато","зачем","здесь","земля","знать","значит","значить","и","идти","из","или","им","именно","иметь","ими","имя","иногда","их","к","каждая","каждое","каждые","каждый","кажется","казаться","как","какая","какой","кем","книга","когда","кого","ком","комната","кому","конец","конечно","которая","которого","которой","которые","который","которых","кроме","кругом","кто","куда","лежать","лет","ли","лицо","лишь","лучше","любить","люди","м","маленький","мало","мать","машина","между","меля","менее","меньше","меня","место","миллионов","мимо","минута","мир","мира","мне","много","многочисленная","многочисленное","многочисленные","многочисленный","мной","мною","мог","могут","мож","может","можно","можхо","мои","мой","мор","москва","мочь","моя","моё","мы","на","наверху","над","надо","назад","наиболее","найти","наконец","нам","нами","народ","нас","начала","начать","наш","наша","наше","наши","не","него","недавно","недалеко","нее","ней","некоторый","нельзя","нем","немного","нему","непрерывно","нередко","несколько","нет","нею","неё","ни","нибудь","ниже","низко","никакой","никогда","никто","никуда","ними","них","ничего","ничто","но","новый","нога","ночь","ну","нужно","нужный","нх","о","об","оба","обычно","один","одиннадцатый","одиннадцать","однажды","однако","одного","одной","оказаться","окно","около","он","она","они","оно","опять","особенно","остаться","от","ответить","отец","отовсюду","отсюда","очень","первый","перед","писать","плечо","по","под","подумать","пожалуйста","позже","пойти","пока","пол","получить","помнить","понимать","понять","пор","пора","после","последний","посмотреть","посреди","потом","потому","почему","почти","правда","прекрасно","при","про","просто","против","процентов","пятнадцатый","пятнадцать","пятый","пять","работа","работать","раз","разве","рано","раньше","ребенок","решить","россия","рука","русский","ряд","рядом","с","сам","сама","сами","самим","самими","самих","само","самого","самой","самом","самому","саму","самый","свет","свое","своего","своей","свои","своих","свой","свою","сделать","сеаой","себе","себя","сегодня","седьмой","сейчас","семнадцатый","семнадцать","семь","сидеть","сила","сих","сказал","сказала","сказать","сколько","слишком","слово","случай","смотреть","сначала","снова","со","собой","собою","советский","совсем","спасибо","спросить","сразу","стал","старый","стать","стол","сторона","стоять","страна","суть","считать","т","та","так","такая","также","таки","такие","такое","такой","там","твой","твоя","твоё","те","тебе","тебя","тем","теми","теперь","тех","то","тобой","тобою","товарищ","тогда","того","тоже","только","том","тому","тот","тою","третий","три","тринадцатый","тринадцать","ту","туда","тут","ты","тысяч","у","увидеть","уж","уже","улица","уметь","утро","хороший","хорошо","хотеть","хоть","хотя","хочешь","час","часто","часть","чаще","чего","человек","чем","чему","через","четвертый","четыре","четырнадцатый","четырнадцать","что","чтоб","чтобы","чуть","шестнадцатый","шестнадцать","шестой","шесть","эта","эти","этим","этими","этих","это","этого","этой","этом","этому","этот","эту","я"]

index_dict = {}
frequency_dict = {}
whole_list_of_docs = []
total_w_in_doc = {}
BM25_for_all = {}
m = Mystem()
q = input("search: ")
query = m.lemmatize(q)
#for n in range (len(query)):
    #m = Mystem()
    #query[n] = m.lemmatize(query[n].strip('!:"?,;.-\)\('))

for filename in glob.iglob('./articles/*.txt'):
    whole_list_of_docs.append(filename)
#print(whole_list_of_docs)
list_of_docs = []    
for i in range(5):
    list_of_docs.append(whole_list_of_docs[i])
#print(list_of_docs)
    
print(BM25_foralldocs(query, text_to_words_massive(list_of_docs, stop_words_list), 
                      word_frequency(text_to_words_massive(list_of_docs, stop_words_list), frequency_dict),
                      reverse_index(text_to_words_massive(list_of_docs, stop_words_list), index_dict), BM25_for_all))                                                                          



search: с новым годом клубе


KeyError: 'приобретение'