In [None]:
import pandas as pd
import numpy as np
#import pymorphy2
from collections import Counter, defaultdict
import utils
import sys
from bm25 import BM25

%load_ext autoreload
%autoreload 2

In [None]:
# Force a fresh import of the bm25 module and rebind BM25 to the class object
# taken from the reloaded module, so that subsequent BM25(...) calls use the
# reloaded definition.
import bm25
import importlib
bm25 = importlib.reload(bm25)
BM25 = bm25.BM25

In [None]:
# BM25 instance over the test document file. The keyword arguments are passed
# straight to the BM25 constructor; their meaning is defined in the bm25 module.
bm25_test = BM25(
    'data/docs_test.tsv',
    norm_method='none',
    debug=2,
    save_normalized='data/docs_normalized.tsv',
)

In [None]:
# Run build() on the instance created from data/docs_test.tsv; what build()
# computes is defined in the bm25 module, not in this notebook.
bm25_test.build()

In [None]:
# Open the full document file in text mode and load a previously saved index
# object. Both stay module-level because utils.get_document(fd, index, doc_id)
# reads from them in later cells; the handle stays open and is not closed in
# any cell visible here.
# NOTE(review): presumably `index` maps doc ids to positions in docs.tsv — confirm in utils.
fd = open('data/docs.tsv', "rt")
index = utils.load_from_file("index.pkl")

In [None]:
# BM25 instance over the full document file; this is the model that score()
# and the feature cells read through the module-level name `bm25_model`.
bm25_model = BM25(
    'data/docs.tsv',
    norm_method='none',
    debug=1,
)

In [None]:
# Load the previously saved model and bind it to `bm25_model`, the name that
# score() and the later cells read, so the cached model is actually used.
# NOTE(review): utils.load_from_file presumably unpickles; only load files you trust.
bm25_model = utils.load_from_file('bm25_unigram.pkl')

In [None]:
# Run build() on the model bound to `bm25_model`; what build() computes is
# defined in the bm25 module, not in this notebook.
bm25_model.build()

In [None]:
# Save the model object under the same file name that the load cell reads
# with utils.load_from_file, so a later session can reuse it.
utils.save_to_file(bm25_model, "bm25_unigram.pkl")

In [None]:
# Load the three inputs used by the scoring cells:
#   queries    - indexed by query_id to obtain the query passed to score()
#   test_data  - iterated as (query_id, docs) pairs
#   train_data - iterated as (query_id, docs, clicks) triples
queries = utils.load_queries('data/queries_tr.tsv')
test_data = utils.load_test('data/test.tsv')
train_data = utils.load_train('data/clicks.train.tsv')

In [None]:
def score(query, doc_id, smooth_idf=False):
    """Score one document against a query with the module-level BM25 model.

    The document is fetched through ``utils.get_document`` using the
    module-level ``fd`` and ``index``, and its title and body are handed to
    ``bm25_model.score_bm25`` together with ``query``.

    Args:
        query: Query passed unchanged to ``score_bm25``.
        doc_id: Identifier of the document to fetch.
        smooth_idf: Forwarded as the ``smooth_idf`` keyword of ``score_bm25``.

    Returns:
        Whatever ``bm25_model.score_bm25`` returns; callers in this notebook
        unpack it as ``(title_score, body_score)``.
    """
    doc_title, doc_body = utils.get_document(fd, index, doc_id)
    return bm25_model.score_bm25(
        query,
        doc_title,
        doc_body,
        smooth_idf=smooth_idf,
    )

In [None]:
# Unsmoothed BM25 scores for every (query, document) pair in the test set.
# Each row is (query_id, doc_id, title score, body score).
results = []
for query_id, doc_ids in test_data:
    query_text = queries[query_id]
    for doc_id in doc_ids:
        title_bm25, body_bm25 = score(query_text, doc_id)
        row = (query_id, doc_id, title_bm25, body_bm25)
        results.append(row)

In [None]:
# One row per (query, document) pair, using the unsmoothed column names.
df = pd.DataFrame(
    results,
    columns=['query_id', 'doc_id', 'score_title_bm25', 'score_body_bm25'],
)

In [None]:
# Persist the unsmoothed test-set score frame.
utils.save_to_file(df, "df_bm25_unigram_scores.pkl")

In [None]:
# Smoothed-IDF BM25 scores for every (query, document) pair in the test set.
# Each row is (query_id, doc_id, title score, body score).
results = []
for query_id, doc_ids in test_data:
    query_text = queries[query_id]
    for doc_id in doc_ids:
        title_bm25, body_bm25 = score(query_text, doc_id, smooth_idf=True)
        row = (query_id, doc_id, title_bm25, body_bm25)
        results.append(row)

In [None]:
# One row per (query, document) pair, using the smoothed column names.
df = pd.DataFrame(
    results,
    columns=['query_id', 'doc_id', 'score_title_bm25_smooth', 'score_body_bm25_smooth'],
)

In [None]:
# Persist the smoothed test-set score frame.
utils.save_to_file(df, "df_bm25_unigram_scores_smooth.pkl")

In [None]:
# Unsmoothed BM25 scores for every (query, document) pair in the training set.
# The third element of each train_data entry (clicks) is not used here.
results = []
for query_id, doc_ids, _clicks in train_data:
    query_text = queries[query_id]
    for doc_id in doc_ids:
        title_bm25, body_bm25 = score(query_text, doc_id)
        row = (query_id, doc_id, title_bm25, body_bm25)
        results.append(row)

In [None]:
# One row per training (query, document) pair, using the unsmoothed column names.
df = pd.DataFrame(
    results,
    columns=['query_id', 'doc_id', 'score_title_bm25', 'score_body_bm25'],
)

In [None]:
# Persist the unsmoothed training-set score frame.
utils.save_to_file(df, "df_train_bm25_unigram_scores.pkl")

In [None]:
# Smoothed-IDF BM25 scores for every (query, document) pair in the training set.
# Each row is (query_id, doc_id, title score, body score).
results = []
for query_id, docs, clicks in train_data:
    query = queries[query_id]
    for doc_id in docs:
        # smooth_idf=True is what distinguishes this pass from the unsmoothed
        # training pass; these rows feed the *_smooth columns and the
        # *_smooth.pkl file, mirroring the smoothed test-set pass.
        title_score, body_score = score(query, doc_id, smooth_idf=True)
        results.append((query_id, doc_id, title_score, body_score))

In [None]:
# One row per training (query, document) pair, using the smoothed column names.
df = pd.DataFrame(
    results,
    columns=['query_id', 'doc_id', 'score_title_bm25_smooth', 'score_body_bm25_smooth'],
)

In [None]:
# Persist the smoothed training-set score frame.
utils.save_to_file(df, "df_train_bm25_unigram_scores_smooth.pkl")

In [None]:
def _summarize(values):
    """Return ``(sum, min, max, mean, std)`` of ``values``.

    An empty input yields five zeros instead of reaching ``np.min`` /
    ``np.max``, which raise ``ValueError`` on zero-size input. That case
    occurs whenever a document field shares no term with the query.
    """
    values = list(values)
    if not values:
        return 0.0, 0.0, 0.0, 0.0, 0.0
    return (np.sum(values), np.min(values), np.max(values),
            np.mean(values), np.std(values))


def get_tf_idf(query, doc_id, model):
    """Compute TF and TF-IDF summary features for one (query, document) pair.

    The document is fetched with ``utils.get_document(fd, index, doc_id)``
    (``fd`` and ``index`` are module-level). Query, title and body are split
    on single spaces. Features are computed for three "fields": the title,
    the body, and the whole document (title and body term counts added).

    Args:
        query: Query string.
        doc_id: Identifier of the document to fetch.
        model: Object exposing ``term_to_title`` and ``term_to_body``
            (dicts from term to a sized collection — presumably document
            ids, confirm against the bm25 module) and ``idf2(doc_count)``.
            Neither dict is modified.

    Returns:
        A 12-tuple: for title, body and whole, in that order, four items:
          * dict mapping each query term present in the field to its term
            frequency divided by the field's token count,
          * ``(sum, min, max, mean, std)`` of the raw term counts,
          * the same statistics of the length-normalised frequencies,
          * the same statistics of ``normalised_tf * model.idf2(doc_count)``,
            where a term missing from the postings contributes 0.
        All statistics are zeros when the field has no query term.
    """
    title, body = utils.get_document(fd, index, doc_id)

    query_terms = set(query.strip().split(' '))
    title_terms = Counter(title.strip().split(' '))
    body_terms = Counter(body.strip().split(' '))
    whole_terms = title_terms + body_terms

    title_term_docs = model.term_to_title
    body_term_docs = model.term_to_body

    def title_doc_count(term):
        # None marks "term unknown to the title postings".
        return len(title_term_docs[term]) if term in title_term_docs else None

    def body_doc_count(term):
        return len(body_term_docs[term]) if term in body_term_docs else None

    def whole_doc_count(term):
        # Union of both postings, computed only for the terms actually needed.
        # Copying one dict and calling update() with the other would replace
        # the title postings of shared terms instead of merging them.
        in_title = term in title_term_docs
        in_body = term in body_term_docs
        if not (in_title or in_body):
            return None
        merged = set()
        if in_title:
            merged.update(title_term_docs[term])
        if in_body:
            merged.update(body_term_docs[term])
        return len(merged)

    def field_features(doc_terms, doc_count_of):
        """Build the four feature items for one field."""
        matched = query_terms.intersection(set(doc_terms.keys()))
        raw_tfs = {term: doc_terms[term] for term in matched}
        token_count = sum(doc_terms.values())
        normalized_tfs = {term: count / token_count for term, count in raw_tfs.items()}

        tf_idfs = []
        for term, tf in normalized_tfs.items():
            doc_count = doc_count_of(term)
            tf_idfs.append(0 if doc_count is None else tf * model.idf2(doc_count))

        return (
            normalized_tfs,
            _summarize(raw_tfs.values()),
            _summarize(normalized_tfs.values()),
            _summarize(tf_idfs),
        )

    return (
        field_features(title_terms, title_doc_count)
        + field_features(body_terms, body_doc_count)
        + field_features(whole_terms, whole_doc_count)
    )

In [None]:
# First ten entries of test_data, used as a small sample for trying out
# get_tf_idf in the next cell.
test_test_data = test_data[0:10]

In [None]:
# Flatten the get_tf_idf output for the sample into one list per
# (query, document) pair: [query_id, doc_id, <flattened feature groups>...].
# NOTE(review): get_tf_idf returns dict groups as well as statistic tuples;
# extending with a dict adds its keys (the matched terms), so rows can differ
# in length — confirm this is intended.
results = []
for query_id, doc_ids in test_test_data:
    query_text = queries[query_id]
    for doc_id in doc_ids:
        feature_groups = get_tf_idf(query_text, doc_id, bm25_model)
        row = [query_id, doc_id]
        for group in feature_groups:
            row.extend(group)
        results.append(row)

In [None]:
test_