# Переменные окружения и обработка файлов

In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive (relative path, so it depends on the current working directory).
DIRECTORY = 'drive/MyDrive/Informatics/Sphere@mail.ru/IR/hw_04/'
# `!cat` captures the file's lines as a list; the first line of data_path.txt is taken as the data root.
DATA_DIRECTORY = !cat $DIRECTORY/data_path.txt
DATA_DIRECTORY = DATA_DIRECTORY[0]
# Plain string concatenation: assumes DATA_DIRECTORY already ends with '/' — TODO confirm.
DATA_SPLIT_DIRECTORY = DATA_DIRECTORY + 'docs/'

In [None]:
%%time
# Copy every .tsv, .csv and .pkl file from the Drive project folder into the working directory.
# (The %%time cell magic must stay on the first line of the cell.)
!cp $DIRECTORY/*.tsv .
!cp $DIRECTORY/*.csv .
!cp $DIRECTORY/*.pkl .

In [None]:
import os
# Names of the entries inside DATA_SPLIT_DIRECTORY; in the visible notebook they are only
# consumed by the commented-out header-extraction cell.
DATA_FILES = os.listdir(DATA_SPLIT_DIRECTORY)

# Функции

In [None]:
import time
def make_submission(submission):
    """Write ``submission`` to a timestamped CSV file in the working directory.

    The file name is the current ``time.asctime()`` string followed by
    ``_submission.csv``; the data is written without its index.

    Args:
        submission: object exposing ``to_csv(path, index=...)`` (a pandas
            DataFrame in this notebook).

    Returns:
        The name of the file that was written.
    """
    name = ''.join((time.asctime(), '_submission.csv'))
    submission.to_csv(name, index=False)
    return name

In [None]:
import pickle
import os

def save_model(model, filename, copy_to_drive=False):
    """Pickle ``model`` to ``filename`` and print the size of the written file.

    Args:
        model: any picklable object.
        filename: target path; ``.pkl`` is appended when it is missing.
        copy_to_drive: when true, also copy the file to
            ``DIRECTORY/<filename>`` (``DIRECTORY`` is the notebook-level
            Drive project folder).

    Raises:
        OSError: if the file cannot be written or, with ``copy_to_drive``,
            cannot be copied. The previous shell ``cp`` printed its error and
            the function still reported 'Copy done!'.
    """
    import shutil  # local import keeps this cell self-contained

    if not filename.endswith('.pkl'):
        filename = filename + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

    byte_size = os.stat(filename).st_size
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    name = units[0]
    for name in units:
        # Never divide on the last unit: otherwise the value would shrink
        # once more while the printed label stayed the same.
        if byte_size > 1024 and name != units[-1]:
            byte_size /= 1024
        else:
            break
    print(filename, 'saved!')
    print(byte_size, name)

    if copy_to_drive:
        # Same destination the shell command used, but without a shell:
        # works outside IPython and with names containing spaces.
        shutil.copy(filename, f'{DIRECTORY}/{filename}')
        print('Copy done!')


def load_model(filename):
    """Return the object pickled in ``filename``.

    ``.pkl`` is appended to ``filename`` when it is missing, mirroring
    ``save_model``. Unpickling executes code stored in the file, so only
    load files you created yourself.
    """
    path = filename if filename.endswith('.pkl') else filename + '.pkl'
    with open(path, 'rb') as source:
        return pickle.load(source)

In [None]:
from gensim.matutils import softcossim

In [None]:
def _save_embedding_caches(model_name, copy_to_drive):
    """Pickle the three notebook-level embedding caches under the ``model_name`` prefix."""
    save_model(query_id_to_embeddings, model_name + 'query_id_to_embeddings', copy_to_drive=copy_to_drive)
    save_model(doc_id_to_embeddings, model_name + 'doc_id_to_embeddings', copy_to_drive=copy_to_drive)
    save_model(query_id_to_header_embeddings, model_name + 'query_id_to_header_embeddings', copy_to_drive=copy_to_drive)


def make_rank_from_embeddings(get_emb_func, get_emb_func_corpus, model_name='', save_every=100, copy_to_drive_after=True, use_bm25=False, one_by_one_query_ids=(207, 1204, 2265)):
    """Rank each query's candidate documents by embedding cosine similarity.

    Reads the notebook-level ``queries``, ``query_to_docs`` and
    ``docs_headers`` and fills the notebook-level caches
    ``query_id_to_embeddings``, ``doc_id_to_embeddings`` and
    ``query_id_to_header_embeddings``; cached entries are never recomputed.

    Args:
        get_emb_func: ``f(text, doc_id=...)`` / ``f(text, query_id=...)``
            returning the embedding of a single text.
        get_emb_func_corpus: ``f(headers, query_id, doc_ids)`` returning the
            embeddings of all headers of one query at once.
        model_name: prefix for the cache files ('_' is appended when non-empty).
        save_every: minimum query-id distance between two periodic cache saves.
        copy_to_drive_after: copy the caches to Drive in the final save (and
            force that final save even when nothing new was computed).
        use_bm25: multiply each similarity by 1 when the document's BM25 score
            exceeds the threshold and by 0.1 otherwise.
        one_by_one_query_ids: queries whose headers are embedded one document
            at a time instead of as one batch. NOTE(review): the default ids
            were hard-coded in the original; presumably batch encoding is a
            problem for them — confirm.

    Returns:
        List of ``(query_id, doc_id)`` pairs, documents of each query ordered
        by descending score. Queries without candidate documents are skipped.
    """
    bm25_threshold = 0.5
    bm25_pass_weight = 1
    bm25_fail_weight = 0.1

    rank_list = []
    # Chosen so that the very first query with new data triggers a save.
    last_save_query_id = -save_every - 1
    # True while the caches hold embeddings that have not been written yet.
    # The original tracked this per query, so the final save could miss data
    # computed after the last periodic save (and crashed on empty `queries`).
    unsaved_data = False
    if model_name:
        model_name += '_'

    for query_id in tqdm(range(len(queries))):
        current_type = 'one' if query_id in one_by_one_query_ids else 'all'

        if query_id not in query_id_to_embeddings:
            query = ' '.join(queries[query_id])
            query_id_to_embeddings[query_id] = get_emb_func(query, query_id=query_id)
            unsaved_data = True
        query_embeddings = query_id_to_embeddings[query_id]

        doc_ids = []
        headers = []
        scores = []
        for doc_id in query_to_docs[query_id]:
            header = ' '.join(docs_headers[doc_id])
            headers.append(header)
            doc_ids.append(doc_id)

            if current_type == 'one':
                if doc_id not in doc_id_to_embeddings:
                    doc_id_to_embeddings[doc_id] = get_emb_func(header, doc_id=doc_id)
                    unsaved_data = True
                header_embeddings = doc_id_to_embeddings[doc_id]
                scores.append(cosine_similarity(query_embeddings, header_embeddings)[0][0])

        scores = np.asarray(scores)
        doc_ids = np.array(doc_ids)
        if len(headers) == 0:
            continue

        if current_type == 'all':
            if query_id not in query_id_to_header_embeddings:
                query_id_to_header_embeddings[query_id] = get_emb_func_corpus(headers, query_id, doc_ids)
                unsaved_data = True
            header_embeddings = query_id_to_header_embeddings[query_id]

            scores = cosine_similarity(query_embeddings, header_embeddings)
            scores = np.reshape(scores, (-1,))

        if use_bm25:
            query = queries[query_id]
            bm_25_scores = np.array([
                BM25_score(title=docs_headers[doc_id], query=query)
                for doc_id in doc_ids
            ])
            bm_25_scores = np.where(bm_25_scores > bm25_threshold, bm25_pass_weight, bm25_fail_weight)
            scores = scores * bm_25_scores

        sorted_indices = list(np.argsort(scores)[::-1])
        for doc_id in doc_ids[sorted_indices]:
            rank_list.append((query_id, doc_id))

        if unsaved_data and query_id - last_save_query_id > save_every:
            last_save_query_id = query_id
            print(f'\rCurrent query_id is {query_id}')
            _save_embedding_caches(model_name, copy_to_drive=False)
            print()
            unsaved_data = False

    if unsaved_data or copy_to_drive_after:
        _save_embedding_caches(model_name, copy_to_drive=copy_to_drive_after)

    return rank_list

# Загрузка данных

In [None]:
import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm
from itertools import count
from collections import defaultdict

import pickle
import json

from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Sample submission; its column names are reused for the final submission frame.
sample_submission = pd.read_csv('sample.csv')
sample_submission

Unnamed: 0,QueryId,DocumentId
0,0,340485
1,0,68106
2,0,237314
3,0,203791
4,0,53265
...,...,...
403966,6310,295145
403967,6310,257785
403968,6310,151034
403969,6310,551291


In [None]:
# Tab-separated queries without a header row; the first column becomes the index.
# This frame is replaced further down by the pickled query list loaded with load_model.
queries = pd.read_csv('queries.tsv', sep='\t', header=None, index_col=0)
queries

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0,13 причин почему
1,1 положительный и 1 отрицательный могут ли
2,2016 действует ли зао рождественская мануфактура
3,1 месяц после операции на кишечнику диета что ...
4,2 правды 1 ложь что можно придумать
...,...
6306,является ли тойота хариер внедорожником
6307,як можно очистити крейду
6308,являются ли реактивы медицинскими изделиями
6309,являются ли словообразовательными парами слова...


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK's Russian stop-word list plus a few extra frequent tokens.
# In the visible notebook it is only read by the commented-out clear_text_tokens.
stop_words = stopwords.words('russian')
stop_words.extend(['ru', 'сколько', 'почему', 'форум', 'онлайн', 'ответ', 'страница'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip3 install pymorphy2[fast]

from pymorphy2 import MorphAnalyzer

# Morphological analyser; in the visible notebook it is only used by the
# commented-out clear_text_tokens (morph.normal_forms).
morph = MorphAnalyzer()



In [None]:
import re

# Token pattern: runs of Latin letters, Cyrillic А-Я/а-я, digits and the extra letters ß ғ қ β.
# The range А-Яа-я does not include 'ё'/'Ё'; my_translate_letter maps 'ё' to 'е'.
re_compiled = re.compile(r'[A-Za-zА-Яа-я0-9ßғқβ]+')

In [None]:
# Single-character replacements (lookalike / non-Russian Cyrillic letters and '_').
# The commented-out clear_text_tokens applies them with re.sub.
# The key 'ґ' used to be listed twice with the same value; the duplicate is removed.
my_translate_letter = {
    'ё':'е',
    'і': 'и',
    'є': 'е',
    'ї': 'й',
    'ј': 'й',
    '_': ' ',
    'ѓ': 'г',
    'ґ': 'г',
    'i': 'и',
    'ў': 'у',
}

In [None]:
# Whole-phrase replacements for garbled queries (several keys look like Russian text typed
# with a Latin keyboard layout or split into syllables).
# The commented-out clear_text_tokens passes each key to re.sub, i.e. keys are treated as regex patterns.
my_translate_word = {
    'rfr gj eltnm': 'как похудеть',
    'd herf': 'в руках',
    'ghbвfn': 'приват',
    'cвeтcкoгo': 'светского',
    'rjвhjв': 'ковров',
    'дол ty': 'должен',
    'чтобы пе ре ве сти зна че ние тем пе ра ту ры по шкале цель сия t c в шкалу фа рен гей та t f поль зу ют ся фор му лой f 1 8c 32 где c гра ду сы цель сия f гра ду сы фа рен гей та какая тем пе ра ту ра по шкале цель сия со от вет ству ет 6 по шкале фа рен гей та ответ округ ли те до де ся тых': 'чтобы перевести значение температуры по шкале цельсия t c в шкалу фаренгейта t f пользуются формулой f 1 8 c 32 где c градусы цельсия f градусы фаренгейта какая температура по шкале цельсия соответствует 6 по шкале фаренгейта ответ округлите до десятых',
    'mjhlt': 'word',
    'jrjвjq gjlвjlrjq': 'боковой подводкой',
    'вjkujuhflt': 'волгоград',
    'ну на ли лиwtypbz hjcnt наlpjhf': 'нужна ли лицензия ростехнадзора',
    'какие о оb gоlо hfnm для tkоuо ку оyyоuо':'какие обои подобрать для белого кухонного',
    'какbt': 'какие',
    'l i b e r t à': 'liberta',
}

In [None]:
# def clear_text_tokens(text):
#     global wrong_set
#     if not isinstance(text, str):
#         text = ' '.join(text)
#     for elem in my_translate_word:
#         text = re.sub(elem, my_translate_word[elem], text)
#     for symb in my_translate_letter:
#         text = re.sub(symb, my_translate_letter[symb], text)
#     text_tokens = re_compiled.findall(text)
#     tokens = []
#     for word in text_tokens:
#         word = morph.normal_forms(word)[0]
#         if word not in stop_words:
#             tokens.append(word)
#     return tokens

In [None]:
# !pip3 install pyaspeller

# from pyaspeller import YandexSpeller

# speller = YandexSpeller()



In [None]:
# queries_tmp = []

# for query in tqdm(queries.values):
#     query = list(query)[0].lower()
#     query = speller.spelled(query)
#     queries_tmp.append(query.split())

# queries = queries_tmp

In [None]:
# save_model(queries, 'queries_after_pyaspeller', copy_to_drive=True)
# Load the cached result of the commented-out pyaspeller cells (one list of words per query).
queries = load_model('queries_after_pyaspeller')

In [None]:
# for i in tqdm(range(len(queries))):
#     queries[i] = clear_text_tokens(queries[i])

In [None]:
# save_model(queries, 'queries_after_pyaspeller_supercleared', copy_to_drive=True)
# queries = load_model('queries_after_pyaspeller_supercleared')





In [None]:
# Re-tokenise every query in place: join its tokens into one string and pass it to preprocess().
# NOTE(review): preprocess is not defined in the visible part of this notebook — confirm where it comes from.
# The cell mutates `queries`, so running it twice preprocesses already-preprocessed tokens.
for i in tqdm(range(len(queries))):
    queries[i] = preprocess(' '.join(queries[i]))

HBox(children=(FloatProgress(value=0.0, max=6311.0), HTML(value='')))




In [None]:
# needed_ids = set(sample_submission['DocumentId'].values)
# query_to_docs = {
#     query_id: sample_submission[sample_submission['QueryId'] == query_id]['DocumentId'].values
#     for query_id in range(len(queries))
# }

In [None]:
# save_model(query_to_docs, 'query_to_docs', copy_to_drive=True)
# Cached mapping query_id -> candidate DocumentId values (built by the commented-out cell above).
query_to_docs = load_model('query_to_docs')

In [None]:
# docs_headers = {}

# for filename in tqdm(DATA_FILES):
#     with open(DATA_SPLIT_DIRECTORY + filename) as f:
#         for i in count():
#             line = f.readline()
#             if not line:
#                 break
#             line = line.split('\t')
#             doc_id, header = line[0], line[1].lower().split()
#             doc_id = int(doc_id)
#             if doc_id not in needed_ids:
#                 continue
#             docs_headers[doc_id] = header
#             print(f'\r{filename}:\t{i} lines processed', end='')

# # About 20 min

In [None]:
# save_model(docs_headers, 'needed_docs_headers', copy_to_drive=True)
# Cached mapping doc_id -> lower-cased, whitespace-split header (built by the commented-out cell above).
docs_headers = load_model('needed_docs_headers')

In [None]:
# for i in tqdm(docs_headers):
#     docs_headers[i] = clear_text_tokens(docs_headers[i])

HBox(children=(FloatProgress(value=0.0, max=373966.0), HTML(value='')))




In [None]:
# save_model(docs_headers, 'needed_docs_headers_supercleared', copy_to_drive=True)
# docs_headers = load_model('needed_docs_headers_supercleared')

needed_docs_headers_supercleared.pkl saved!
56.90823459625244 MB
Copy done!


In [None]:
# Re-tokenise every header in place (values are replaced, keys are untouched, so iterating the dict is safe).
# NOTE(review): preprocess is not defined in the visible part of this notebook — confirm where it comes from.
for i in tqdm(docs_headers):
    docs_headers[i] = preprocess(' '.join(docs_headers[i]))

HBox(children=(FloatProgress(value=0.0, max=373966.0), HTML(value='')))




# BM 25

In [None]:
# Count every token of every query and header. The Counter's keys already are
# the vocabulary, so no separate list of all token occurrences is kept.
words_counter = Counter()

for query in tqdm(queries):
    words_counter.update(query)

for header in tqdm(docs_headers.values()):
    words_counter.update(header)

# Unique words seen in queries or headers.
words_all = list(words_counter)

words_counter.most_common(10)

HBox(children=(FloatProgress(value=0.0, max=6311.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=373966.0), HTML(value='')))




[('один', 72376),
 ('ноль', 68646),
 ('два', 61941),
 ('ru', 40937),
 ('три', 25313),
 ('форум', 23831),
 ('пять', 22693),
 ('четыре', 21837),
 ('онлайн', 21018),
 ('шесть', 19708)]

In [None]:
# Document frequency: in how many headers each word occurs at least once.
words_count = defaultdict(float)

for doc_id in tqdm(docs_headers):
    unique_words = set(docs_headers[doc_id])
    for word in unique_words:
        words_count[word] += 1

# Weight of a word is 1 / document frequency; words that occur in no header keep 1.0.
words_idf = {
    **dict.fromkeys(words_all, 1.0),
    **{word: 1 / word_count for word, word_count in words_count.items()},
}

HBox(children=(FloatProgress(value=0.0, max=373966.0), HTML(value='')))




In [None]:
# Mean number of tokens per header; used for length normalisation in BM25_score.
avg_title_len = np.mean([len(header) for header in docs_headers.values()])
avg_title_len

7.418759994224074

In [None]:
def BM25_score(title, query, k=2, b=0.75):
    """Score ``title`` against ``query`` with a BM25-style formula.

    For every word of ``query`` (repeated words count repeatedly) the term
    frequency is ``title.count(word)``; it is saturated with ``k`` and
    normalised by ``len(title) / avg_title_len`` weighted by ``b``, then
    multiplied by the word's weight from the notebook-level ``words_idf``.

    Args:
        title: tokens of the document header.
        query: tokens of the query.
        k: term-frequency saturation parameter.
        b: length-normalisation strength.

    Returns:
        The accumulated score as a float (0.0 for an empty query).
    """
    term_frequencies = {word: title.count(word) for word in query}

    total = 0.0
    for word in query:
        idf = words_idf[word]
        tf = term_frequencies[word]
        denominator = tf + k * (1 - b + b * (len(title) / avg_title_len))
        total += idf * ((tf * (k + 1)) / denominator)
    return total

# Google

In [None]:
!pip3 install tensorflow_text

import tensorflow as tf
# NOTE(review): tensorflow_text is never referenced by name; presumably imported for its side effects — confirm.
import tensorflow_text
import tensorflow_hub as hub

# Multilingual Universal Sentence Encoder QA model; the cells below call its
# 'question_encoder' and 'response_encoder' signatures.
module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')



In [None]:
# Prefix of the embedding-cache files used by this section.
model_name = 'google_supercleared'

In [None]:
def get_emb_func_corpus(texts, query_id, doc_ids):
    """Embed all ``texts`` in one call to the model's response encoder.

    Each text is passed both as the response and as its own context.
    ``query_id`` and ``doc_ids`` are accepted for interface compatibility
    and are not used.

    Returns:
        The encoder's 'outputs' tensor converted to a NumPy array.
    """
    encoder = module.signatures['response_encoder']
    encoded = encoder(input=tf.constant(texts), context=tf.constant(texts))
    return encoded['outputs'].numpy()

In [None]:
def get_emb_func(text, doc_id=None, query_id=None):
    """Embed a single text with the loaded model.

    When ``query_id`` is given the text is treated as a question and goes
    through the 'question_encoder' signature; otherwise it goes through the
    'response_encoder' signature with the text as its own context.

    Returns:
        The encoder's 'outputs' tensor for the one-element batch, as a NumPy array.
    """
    signatures = module.signatures
    if query_id is None:
        encoded = signatures['response_encoder'](
            input=tf.constant([text]),
            context=tf.constant([text]),
        )
    else:
        encoded = signatures['question_encoder'](tf.constant([text]))
    return encoded['outputs'].numpy()

In [None]:
# Start with empty embedding caches (filled and read by make_rank_from_embeddings).
query_id_to_embeddings = {}
query_id_to_header_embeddings = {}
doc_id_to_embeddings = {}

In [None]:
# Optional: resume from caches pickled by an earlier run; the '<model_name>_*.pkl' files must
# exist in the working directory. This replaces the empty dicts created in the previous cell.
query_id_to_embeddings = load_model(model_name + '_' + 'query_id_to_embeddings')
query_id_to_header_embeddings = load_model(model_name + '_' + 'query_id_to_header_embeddings')
doc_id_to_embeddings = load_model(model_name + '_' + 'doc_id_to_embeddings')

In [None]:
# Rank by embedding similarity only; the caches are copied to Drive when the run finishes.
rank_list = make_rank_from_embeddings(get_emb_func, get_emb_func_corpus, model_name=model_name, copy_to_drive_after=True)

HBox(children=(FloatProgress(value=0.0, max=6311.0), HTML(value='')))

Current query_id is 205
google_supercleared_query_id_to_embeddings.pkl saved!
423.9228515625 KB
google_supercleared_doc_id_to_embeddings.pkl saved!
6 B
google_supercleared_query_id_to_header_embeddings.pkl saved!
23.499869346618652 MB

Current query_id is 307
google_supercleared_query_id_to_embeddings.pkl saved!
634.4482421875 KB
google_supercleared_doc_id_to_embeddings.pkl saved!
164.5029296875 KB
google_supercleared_query_id_to_header_embeddings.pkl saved!
33.84346008300781 MB

Current query_id is 408
google_supercleared_query_id_to_embeddings.pkl saved!
842.9580078125 KB
google_supercleared_doc_id_to_embeddings.pkl saved!
164.5029296875 KB
google_supercleared_query_id_to_header_embeddings.pkl saved!
43.35876655578613 MB

Current query_id is 510
google_supercleared_query_id_to_embeddings.pkl saved!
1.0288400650024414 MB
google_supercleared_doc_id_to_embeddings.pkl saved!
164.5029296875 KB
google_supercleared_query_id_to_header_embeddings.pkl saved!
55.14228057861328 MB

Current quer

# Google + BM25

In [None]:
# Switch to the caches stored under the 'google' prefix (different from 'google_supercleared' above).
model_name = 'google'

In [None]:
# Load the 'google_*.pkl' caches; they must already exist in the working directory.
query_id_to_embeddings = load_model(model_name + '_' + 'query_id_to_embeddings')
query_id_to_header_embeddings = load_model(model_name + '_' + 'query_id_to_header_embeddings')
doc_id_to_embeddings = load_model(model_name + '_' + 'doc_id_to_embeddings')

In [None]:
# Same ranking with BM25 gating enabled; the caches are not copied to Drive afterwards.
rank_list = make_rank_from_embeddings(get_emb_func, get_emb_func_corpus, model_name=model_name, copy_to_drive_after=False, use_bm25=True)

HBox(children=(FloatProgress(value=0.0, max=6311.0), HTML(value='')))




# Fasttext

In [None]:
# Install the fasttext package into the Colab runtime (unpinned; the recorded output shows 0.9.2).
!pip3 install fasttext

import fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 13.7MB/s eta 0:00:01[K     |█████████▌                      | 20kB 17.0MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 9.5MB/s eta 0:00:01[K     |███████████████████             | 40kB 9.7MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 5.3MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 5.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.5MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3098485 sha256=fbe8aa42a70080d1ade3a017f1e2b732fee492896e36d7b61c920e4053ae7576
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c15

In [None]:
# Copy the fastText model file from the data directory unless it is already in the working directory.
fasttext_name = 'cc.ru.300.bin'
if not os.path.exists(fasttext_name):
    !cp $DATA_DIRECTORY/$fasttext_name ./$fasttext_name
else:
    print('No copy')

In [None]:
# import fasttext.util
# fasttext.util.download_model('ru', if_exists='ignore') 

In [None]:
# Load the fastText model; the literal repeats the value of fasttext_name from the previous cell.
ft = fasttext.load_model('cc.ru.300.bin')



In [None]:
# Prefix of the embedding-cache files for the fastText run.
model_name = 'fasttext'

In [None]:
# Reset the caches so that no embeddings from the previous model are reused.
query_id_to_embeddings = {}
query_id_to_header_embeddings = {}
doc_id_to_embeddings = {}

In [None]:
def get_emb_func(text, doc_id=None, query_id=None):
    """Return the fastText sentence vector of ``text`` reshaped to rows of 300 values.

    ``doc_id`` and ``query_id`` are accepted for interface compatibility
    and are not used.
    """
    sentence_vector = ft.get_sentence_vector(text)
    return sentence_vector.reshape((-1, 300))

In [None]:
def get_emb_func_corpus(texts, query_id, doc_ids):
    """Embed the headers of one query, reusing cached per-document vectors.

    For each position, the vector comes from the notebook-level
    ``doc_id_to_embeddings`` when the document is cached there, otherwise
    from ``get_emb_func`` (the cache itself is not updated here).
    ``query_id`` is not used.

    Returns:
        NumPy array reshaped to ``(-1, 300)``.
    """
    rows = [
        doc_id_to_embeddings[doc_id]
        if doc_id in doc_id_to_embeddings
        else get_emb_func(texts[position], doc_id=doc_id)
        for position, doc_id in enumerate(doc_ids)
    ]
    return np.array(rows).reshape(-1, 300)

In [1]:
# fastText embeddings with BM25 gating; caches are saved locally but not copied to Drive.
rank_list = make_rank_from_embeddings(get_emb_func, get_emb_func_corpus, model_name=model_name, copy_to_drive_after=False, use_bm25=True)

# Google + ft

In [None]:
# Reset the caches for the combined model (the same reset is repeated right before the ranking call).
query_id_to_embeddings = {}
doc_id_to_embeddings = {}
query_id_to_header_embeddings = {}

In [None]:
# Prefix of the cache files written for the concatenated embeddings.
model_name = 'fasttext_google'

In [None]:
# Caches written by a fastText run ('fasttext_*.pkl' must exist in the working directory).
query_id_to_embeddings_fasttext = load_model('fasttext_query_id_to_embeddings')
doc_id_to_embeddings_fasttext = load_model('fasttext_doc_id_to_embeddings')
query_id_to_header_embeddings_fasttext = load_model('fasttext_query_id_to_header_embeddings')

In [None]:
# Caches written by a run with the 'google' prefix ('google_*.pkl' must exist in the working directory).
query_id_to_embeddings_google = load_model('google_query_id_to_embeddings')
query_id_to_header_embeddings_google = load_model('google_query_id_to_header_embeddings')
doc_id_to_embeddings_google = load_model('google_doc_id_to_embeddings')

In [None]:
def get_emb_func_corpus(texts, query_id, doc_ids):
    """Concatenate the cached header embeddings of ``query_id`` from both models.

    Nothing is computed here: the arrays are looked up in the notebook-level
    ``query_id_to_header_embeddings_google`` and
    ``query_id_to_header_embeddings_fasttext`` and stacked horizontally
    (google first). ``texts`` and ``doc_ids`` are not used.
    """
    google_part = query_id_to_header_embeddings_google[query_id]
    fasttext_part = query_id_to_header_embeddings_fasttext[query_id]
    return np.hstack((google_part, fasttext_part))

In [None]:
def get_emb_func(text, doc_id=None, query_id=None):
    """Concatenate the cached embeddings of one query or one document.

    With ``query_id`` the query caches of both models are used; an empty
    array is returned when the query is missing from the google cache.
    Without ``query_id`` the per-document caches are looked up by
    ``doc_id``. ``text`` is not used; google comes first in the result.
    """
    if query_id is None:
        parts = (
            doc_id_to_embeddings_google[doc_id],
            doc_id_to_embeddings_fasttext[doc_id],
        )
    elif query_id in query_id_to_embeddings_google:
        parts = (
            query_id_to_embeddings_google[query_id],
            query_id_to_embeddings_fasttext[query_id],
        )
    else:
        return np.array([])
    return np.hstack(parts)

In [None]:
# Empty caches: make_rank_from_embeddings fills them with the concatenated embeddings.
query_id_to_embeddings = {}
doc_id_to_embeddings = {}
query_id_to_header_embeddings = {}

In [None]:
# Rank with the concatenated embeddings (no BM25); the caches are copied to Drive when the run finishes.
rank_list = make_rank_from_embeddings(get_emb_func, get_emb_func_corpus, model_name=model_name, copy_to_drive_after=True)

HBox(children=(FloatProgress(value=0.0, max=6311.0), HTML(value='')))

Current query_id is 0
fasttext_google_query_id_to_embeddings.pkl saved!
3.3330078125 KB
fasttext_google_doc_id_to_embeddings.pkl saved!
6 B
fasttext_google_query_id_to_header_embeddings.pkl saved!
533.0361328125 KB

Current query_id is 101
fasttext_google_query_id_to_embeddings.pkl saved!
329.10546875 KB
fasttext_google_doc_id_to_embeddings.pkl saved!
6 B
fasttext_google_query_id_to_header_embeddings.pkl saved!
18.347030639648438 MB

Current query_id is 204
fasttext_google_query_id_to_embeddings.pkl saved!
662.044921875 KB
fasttext_google_doc_id_to_embeddings.pkl saved!
6 B
fasttext_google_query_id_to_header_embeddings.pkl saved!
37.02310562133789 MB

Current query_id is 307
fasttext_google_query_id_to_embeddings.pkl saved!
995.03515625 KB
fasttext_google_doc_id_to_embeddings.pkl saved!
257.0810546875 KB
fasttext_google_query_id_to_header_embeddings.pkl saved!
53.666693687438965 MB

Current query_id is 408
fasttext_google_query_id_to_embeddings.pkl saved!
1.290633201599121 MB
fasttex

# Submission

In [None]:
# Build the submission from the most recent rank_list (whichever ranking cell ran last),
# reusing the sample submission's column names, and preview the rows of query 0.
submission = pd.DataFrame(rank_list, columns=sample_submission.columns)
submission[submission.QueryId == 0]

Unnamed: 0,QueryId,DocumentId
0,0,563677
1,0,461894
2,0,49847
3,0,64459
4,0,113720
...,...,...
163,0,566487
164,0,107045
165,0,564616
166,0,515943


In [None]:
# Write the submission CSV; `name` is the generated file name.
name = make_submission(submission)

In [None]:
# Display the name of the file that was written.
name

'Mon May 31 22:23:42 2021_submission.csv'