In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
DIRECTORY = 'drive/MyDrive/Informatics/Sphere@mail.ru/IR/hw_05/'

In [None]:
%%time
!cp $DIRECTORY/clicks_test_.tsv .
!cp $DIRECTORY/*.pkl .

CPU times: user 374 ms, sys: 79.9 ms, total: 454 ms
Wall time: 50.4 s


In [None]:
!ls -lh

total 1.9G
-rw------- 1 root root  91M May 31 16:27 clicks_test_.tsv
drwx------ 6 root root 4.0K May 31 16:27 drive
-rw------- 1 root root 902M May 31 16:28 one_header_embeddings_from_id.pkl
-rw------- 1 root root 902M May 31 16:28 query_embeddings_from_id.pkl
drwxr-xr-x 1 root root 4.0K May  6 13:44 sample_data


In [None]:
import pickle
import os

def save_model(model, filename, copy_to_drive=False):
    if not filename.endswith('.pkl'):
        filename = filename + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f) 
    byte_size = os.stat(filename).st_size
    for name in ['B', 'KB', 'MB', 'GB']:
        if byte_size > 1024:
            byte_size /= 1024
        else:
            break
    print(filename, 'saved!')
    print(byte_size, name)
    if copy_to_drive:
        !cp $filename $DIRECTORY/$filename
        print('Copy done!')


def load_model(filename):
    if not filename.endswith('.pkl'):
        filename = filename + '.pkl'
    with open(filename, 'rb') as f:
        model = pickle.load(f) 
    return model

In [None]:
!pip3 install tensorflow_text

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')

Collecting tensorflow_text
[?25l  Downloading https://files.pythonhosted.org/packages/c0/ed/bbb51e9eccca0c2bfdf9df66e54cdff563b6f32daed9255da9b9a541368f/tensorflow_text-2.5.0-cp37-cp37m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 4.9MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.5.0


In [None]:
import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm

  after removing the cwd from sys.path.


In [None]:
clicks_test = pd.read_csv('clicks_test_.tsv', sep='\t', header=None, index_col=0)
clicks_test.head()

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2018 01 МОСКВА КОНЦЕРТ,SEXZIMA ПАРЕНЬ МОЛОДОЙ ПИЗДА РУССКАЯ ЛИЗАТЬ РУ...
1,ВЫБОРЫ ВЫБОР ПРИКОЛ,ВЫБОРЫ ВЫБОР ПРИКОЛ
2,ANAL IZMENA IZMENON PORN,ФОТКА АНАЛ ИЗМЕНА ПОРНО ЖЕНА МУЖ
3,ПЛОСКОСТЬ НАКЛОННЫЙ ПОДЪЕСТЬ ВВЕРХ ПОДЪЕМ КОЛЕСО,ДАВИНЧАТЬ ДАВИНЧИ ДАВИНЧ ДОСТАВКА
4,ВИКИПЕДИЯ РИББЕНТРОП,WIKIPEDIA ВІКІПЕДІЯ ВИКИПЕДИЯ РИББЕНТРОП


In [None]:
query = clicks_test.iloc[0][1].split()
titles = [clicks_test.iloc[0][2].split(), clicks_test.iloc[1][2].split()]

In [None]:
!pip3 install pyaspeller

from pyaspeller import YandexSpeller

speller = YandexSpeller()

Collecting pyaspeller
  Downloading https://files.pythonhosted.org/packages/96/6f/d48b211bf3c77c490707679a2e502d2686e4faa15ecca7cb5d3264a89fc5/pyaspeller-0.2.0-py2.py3-none-any.whl
Installing collected packages: pyaspeller
Successfully installed pyaspeller-0.2.0


In [None]:
from collections import defaultdict

In [None]:
def clean_text(text):
    return text

In [None]:
words_all = []
id_to_query = []
id_to_title = []

for query, title in tqdm(clicks_test.values):
    query = query.lower()
    title = title.lower()
    cleaned_query = clean_text(query)
    cleaned_title = clean_text(title)
    words_all.extend(cleaned_query.split())
    words_all.extend(cleaned_title.split())
    id_to_query.append(cleaned_query)
    id_to_title.append(cleaned_title)

words_all = list(set(words_all))




HBox(children=(FloatProgress(value=0.0, max=500000.0), HTML(value='')))




In [None]:
words_count = defaultdict(float)

for word in tqdm(id_to_title):
    words_count[word] += 1

words_idf = {word: 1.0 for word in words_all}
words_idf.update({word: 1 / word_count for word, word_count in words_count.items()})

HBox(children=(FloatProgress(value=0.0, max=500000.0), HTML(value='')))




In [None]:
avg_title_len = np.mean([len(title) for title in id_to_title])
avg_title_len

64.430586

In [None]:
def BM25_score(title, query, k=2, b=0.75):
    bm25_score = 0.0

    query_tf = defaultdict(float)
    for word in query:
        query_tf[word] = title.count(word)
    
    for word in query:
        bm25_score += words_idf[word] * ((query_tf[word] * (k + 1)) / (query_tf[word] + k * (1 - b + b * (len(title) / avg_title_len))))
    
    return bm25_score

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
query_embeddings_from_id = {}
one_header_embeddings_from_id = {}

In [None]:
query_embeddings_from_id = load_model('query_embeddings_from_id')
one_header_embeddings_from_id = load_model('one_header_embeddings_from_id')

In [None]:
scores = []
scores_bm25 = []
last_save_query_id = -1000

for query_id in tqdm(range(len(id_to_query))):
    
    new_data = False
    query = id_to_query[query_id]
    
    if query_id not in query_embeddings_from_id:
        query_embeddings = module.signatures['question_encoder'](tf.constant([query]))['outputs'].numpy()
        query_embeddings_from_id[query_id] = query_embeddings
        new_data = True
    query_embeddings = query_embeddings_from_id[query_id]

    title = id_to_title[query_id]

    if query_id not in one_header_embeddings_from_id:
        header_embeddings = module.signatures['response_encoder'](
                input=tf.constant([title]),
                context=tf.constant([title])
            )['outputs'].numpy()
        one_header_embeddings_from_id[query_id] = header_embeddings
        new_data = True
    header_embeddings = one_header_embeddings_from_id[query_id]
    score = cosine_similarity(query_embeddings, header_embeddings)[0][0]
    scores.append(score)
    scores_bm25.append(BM25_score(title=title.split(), query=query.split()))
    
    if new_data and query_id - last_save_query_id > 1000:
        last_save_query_id = query_id
        print(f'\rCurrent query_id is {query_id}')
        save_model(query_embeddings_from_id, 'query_embeddings_from_id', copy_to_drive=False)
        save_model(one_header_embeddings_from_id, 'one_header_embeddings_from_id', copy_to_drive=False)
        print()

HBox(children=(FloatProgress(value=0.0, max=500000.0), HTML(value='')))




In [None]:
# save_model(query_embeddings_from_id, 'query_embeddings_from_id', copy_to_drive=True)
# save_model(one_header_embeddings_from_id, 'one_header_embeddings_from_id', copy_to_drive=True)

query_embeddings_from_id.pkl saved!
1008.1268606185913 MB
Copy done!
one_header_embeddings_from_id.pkl saved!
1008.1268606185913 MB
Copy done!


In [None]:
scores_np = np.array(scores_bm25)
scores_np = np.where(scores_np > 0.5, 1, 0)

In [None]:
scores_np[scores_np == 1].size

236442

In [None]:
from itertools import zip_longest
data = zip_longest(range(len(id_to_query)), scores_np, fillvalue=0)

In [None]:
submission = pd.DataFrame(data=data, columns=['Id', 'Predicted'])
submission

Unnamed: 0,Id,Predicted
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1
...,...,...
499995,499995,0
499996,499996,0
499997,499997,1
499998,499998,1


In [None]:
import time
def make_submission(submission):
    name = time.asctime() + '_submission.csv'
    submission.to_csv(name, index=False)
    return name

In [None]:
make_submission(submission)

'Mon May 31 20:42:59 2021_submission.csv'