In [1]:
import pandas as pd
import numpy as np
#import pymorphy2
#from collections import Counter, defaultdict
import utils
import sys
from bm25 import BM25

%load_ext autoreload
%autoreload 2

In [17]:
# Load the inputs used by every later cell (paths are relative to the working directory).
# queries: indexed by query_id; its values are query strings (they are .strip()/.split() below).
queries = utils.load_queries('data/queries_tr.tsv')
# test_data: iterated as (query_id, docs) pairs by the test-set scoring loops.
test_data = utils.load_test('data/test.tsv')
# train_data: iterated as (query_id, docs, clicks) triples by the train-set scoring loops.
train_data = utils.load_train('data/clicks.train.tsv')

In [3]:
# Collect every pair of adjacent query terms, joined by a single space, across all queries.
bigrams = set()
for query in queries.values():
    terms = query.strip().split(' ')
    # Pair each term with its successor; a query with fewer than two terms adds nothing.
    bigrams.update(left + ' ' + right for left, right in zip(terms, terms[1:]))

In [4]:
# Keep the document collection open for the rest of the notebook: score() passes this
# handle to utils.get_document on every call.
# NOTE(review): the handle is never closed in the visible cells and is opened with the
# platform-default text encoding — confirm both are intended.
fd = open('data/docs.tsv', "rt")
# Handed to utils.get_document together with fd and a doc_id; presumably maps a doc_id to
# its position in docs.tsv — TODO confirm in utils.
# NOTE(review): judging by the .pkl name this is likely a pickle; only load index files
# that were produced locally.
index = utils.load_from_file("index.pkl")

In [5]:
# Create the bigram BM25 model over the document collection; nothing is built yet —
# the bigram filter is attached and build() is called in the following cells.
bm25_model_bigrams = BM25(
    'data/docs.tsv',
    norm_method='word_bigrams',
    debug=1,
)

In [6]:
# Attach the query-derived bigram set to the model before build() runs.
# Presumably this restricts the model to bigrams that occur in some query — TODO confirm in bm25.py.
bm25_model_bigrams.bigram_filter = bigrams

In [7]:
# Build the model; the recorded run printed "Processed documents 394300".
bm25_model_bigrams.build()

Processed documents 394300

In [8]:
# Persist the built model so it can be reused without rebuilding
# (presumably pickled, judging by the .pkl name — confirm in utils.save_to_file).
utils.save_to_file(bm25_model_bigrams, "bm25_bigrams.pkl")

In [9]:
def score(query, doc_id, smooth_idf=False):
    """Score one document against a query with the bigram BM25 model.

    The document is fetched through ``utils.get_document`` using the
    notebook-level ``fd`` handle and ``index``; its title and body are then
    passed to ``bm25_model_bigrams.score_bm25``.

    Args:
        query: Query text, forwarded unchanged to ``score_bm25``.
        doc_id: Identifier of the document to fetch.
        smooth_idf: Forwarded to ``score_bm25`` under the same keyword.

    Returns:
        The value returned by ``score_bm25``; the scoring loops in this
        notebook unpack it as ``(title_score, body_score)``.
    """
    document = utils.get_document(fd, index, doc_id)
    doc_title, doc_body = document
    return bm25_model_bigrams.score_bm25(
        query,
        doc_title,
        doc_body,
        smooth_idf=smooth_idf,
    )

In [10]:
# Score every (query, document) pair of the test set with smooth_idf left at its default.
results = []
for query_id, docs in test_data:
    query = queries[query_id]  # resolved once per query, before its documents are visited
    results.extend(
        (query_id, doc_id, title_score, body_score)
        for doc_id in docs
        # Single-element list: binds score()'s two return values for the row above.
        for title_score, body_score in [score(query, doc_id)]
    )

In [11]:
# One row per scored (query, document) pair of the test set.
df = pd.DataFrame(
    data=results,
    columns=[
        'query_id',
        'doc_id',
        'score_title_bm25_bigram',
        'score_body_bm25_bigram',
    ],
)

In [12]:
# Persist the test-set scores computed without IDF smoothing.
# NOTE: `df` is rebound by later cells, so run this right after the cell that builds it.
utils.save_to_file(df, "df_bm25_bigram_scores.pkl")

In [13]:
# Score every (query, document) pair of the test set again, this time with smooth_idf=True.
results_smooth = []
for query_id, docs in test_data:
    query = queries[query_id]  # resolved once per query, before its documents are visited
    results_smooth.extend(
        (query_id, doc_id, title_score, body_score)
        for doc_id in docs
        # Single-element list: binds score()'s two return values for the row above.
        for title_score, body_score in [score(query, doc_id, smooth_idf=True)]
    )

In [14]:
# One row per scored (query, document) pair of the test set, smoothed-IDF variant.
df = pd.DataFrame(
    data=results_smooth,
    columns=[
        'query_id',
        'doc_id',
        'score_title_bm25_bigram_smooth',
        'score_body_bm25_bigram_smooth',
    ],
)

In [15]:
# Persist the test-set scores computed with smooth_idf=True.
# NOTE: `df` is rebound by other cells, so run this right after the cell that builds it.
utils.save_to_file(df, "df_bm25_bigram_scores_smooth.pkl")

## Train data

In [None]:
# Score every (query, document) pair of the train set with smooth_idf left at its default.
# `clicks` is unpacked from each record but not used for scoring.
results = []
for query_id, docs, clicks in train_data:
    query = queries[query_id]  # resolved once per query, before its documents are visited
    results.extend(
        (query_id, doc_id, title_score, body_score)
        for doc_id in docs
        # Single-element list: binds score()'s two return values for the row above.
        for title_score, body_score in [score(query, doc_id)]
    )

In [None]:
# One row per scored (query, document) pair of the train set.
df = pd.DataFrame(
    data=results,
    columns=[
        'query_id',
        'doc_id',
        'score_title_bm25_bigram',
        'score_body_bm25_bigram',
    ],
)

In [None]:
# Persist the train-set scores computed without IDF smoothing.
# NOTE: `df` and `results` are also used by the test-set cells, so run this right after
# the train-set cells that build them.
utils.save_to_file(df, "df_train_bm25_bigram_scores.pkl")

In [None]:
# Score every (query, document) pair of the train set with smooth_idf=True.
# `clicks` is unpacked from each record but not used for scoring.
results_smooth = []
for query_id, docs, clicks in train_data:
    query = queries[query_id]  # resolved once per query, before its documents are visited
    results_smooth.extend(
        (query_id, doc_id, title_score, body_score)
        for doc_id in docs
        # Single-element list: binds score()'s two return values for the row above.
        for title_score, body_score in [score(query, doc_id, smooth_idf=True)]
    )

In [None]:
# One row per scored (query, document) pair of the train set, smoothed-IDF variant.
df = pd.DataFrame(
    data=results_smooth,
    columns=[
        'query_id',
        'doc_id',
        'score_title_bm25_bigram_smooth',
        'score_body_bm25_bigram_smooth',
    ],
)

In [None]:
# Persist the train-set scores computed with smooth_idf=True.
# NOTE: `df` is rebound by other cells, so run this right after the cell that builds it.
utils.save_to_file(df, "df_train_bm25_bigram_scores_smooth.pkl")