In [51]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import bs4
import base64
from tqdm import tqdm_notebook
import numpy as np

In [52]:
# Client for a single Elasticsearch node on localhost:9200. The timeout and
# maxsize values are passed through in the host dict (timeout is presumably
# in seconds - TODO confirm against the client version in use).
es_hosts = [{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}]
es = Elasticsearch(es_hosts)

In [53]:
# Name of the index that receives the documents and is queried below.
raw_index_name = "raw_index"

In [54]:
def recreate_index(name, settings):
    """Create index `name` from scratch with the given `settings` body.

    If an index called `name` already exists it is deleted first, so any
    documents stored in it are dropped.
    """
    indices = es.indices
    already_exists = indices.exists(index=name)
    if already_exists:
        indices.delete(index=name)
    indices.create(index=name, body=settings)
    
def check_analyzer(raw_index_name, analyzer, text):
    """Run `text` through an analyzer definition and return the resulting tokens.

    Args:
        raw_index_name: index the analyze request is issued against.
        analyzer: dict describing the analysis chain; it is used as the base
            of the request body and is NOT modified.
        text: string to analyze.

    Returns:
        List with the 'token' value of every entry in the response's
        'tokens' list, in response order.
    """
    # Shallow-copy so that adding 'text' does not leak into the caller's dict.
    body = dict(analyzer)
    body['text'] = text

    response = es.indices.analyze(index=raw_index_name, body=body)
    return [token_info['token'] for token_info in response['tokens']]

In [55]:
# Index definition: field mappings plus the custom analysis chain they use.
settings = {
    'mappings': {
        'properties': {
            'id': {'type': 'integer'},
            'url': {'type': 'text'},
            # 'content' is indexed with the default analyzer and, in the
            # 'content.complex' sub-field, with the custom 'russian_complex' one.
            'content': {
                'type': 'text',
                'fields': {
                    'complex': {'type': 'text', 'analyzer': 'russian_complex'}
                }
            }
        }
    },
    'settings': {
        'analysis': {
            # strip HTML -> pattern tokenizer -> lowercase -> Russian snowball filter
            'analyzer': {
                'russian_complex': {
                    'char_filter': ['no_html'],
                    'tokenizer': 'word_longer_2',
                    'filter': ['lowercase', 'russian_snow']
                },
            },
            'char_filter': {
                'no_html': {'type': 'html_strip', "escaped_tags": []}
            },
            'tokenizer': {
                # group 0: the whole regex match is the token, i.e. runs of at
                # least two Latin letters / digits / '_' / U+0400-U+04FF characters.
                'word_longer_2': {
                    'type': 'pattern',
                    'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                    'group': 0
                }
            },
            'filter': {
                'russian_snow': {'type': 'snowball', 'language': 'russian'}
            }
        }
    }
}

In [56]:
# Start from an empty index: recreate_index deletes an existing index of this name.
recreate_index(name=raw_index_name, settings=settings)

In [57]:
# Smoke test: markup should be stripped by the index's 'no_html' char filter
# and the remaining words should come back lowercased.
analyzer = dict(
    char_filter=['no_html'],
    tokenizer='letter',
    filter=['lowercase'],
)
text = '<div><a>Живи, Мое имя!! Fuck YOU!!</a></div>'

check_analyzer(raw_index_name, analyzer, text)

['живи', 'мое', 'имя', 'fuck', 'you']

In [58]:
def create_es_action(index, doc_id, document):
    """Wrap `document` into a bulk-helper action dict.

    The returned dict has the target index under '_index', the document id
    under '_id' and the document itself (same object, not a copy) under
    '_source'.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [59]:
def es_actions_generator(index_name, file_count=3, docs_per_file=20000):
    """Yield bulk-index actions for documents read from the byweb XML dumps.

    Reads `../byweb_for_course/byweb.<n>.xml` for n in range(file_count).
    In each file the first line is skipped, then every group of three lines
    is parsed as one document with `docid`, `docurl` and `content` tags; the
    latter two are base64-encoded cp1251 text.

    Args:
        index_name: index the generated actions target.
        file_count: number of dump files to read (default 3).
        docs_per_file: maximum number of documents taken from each file
            (default 20000). Reading stops early at end of file.

    Yields:
        Action dicts built by create_es_action; '_id' is a running counter
        that starts at 1 and continues across files.
    """
    action_id = 0
    for file_no in range(file_count):
        with open(f'../byweb_for_course/byweb.{file_no}.xml', 'r') as inf:
            # Skip the first line (presumably the XML header - TODO confirm).
            inf.readline()
            for _ in tqdm(range(docs_per_file)):
                first = inf.readline()
                second = inf.readline()
                third = inf.readline()
                if not third:
                    # readline() returns '' at EOF: no complete record is left.
                    break
                action_id += 1
                document = bs4.BeautifulSoup(first + second + third, "lxml")
                doc = {
                    # str(): a NavigableString keeps a reference to the whole
                    # parse tree; a plain str lets the tree be freed.
                    'id': str(document.docid.contents[0]),
                    'url': base64.b64decode(document.docurl.contents[0]).decode("cp1251"),
                    'content': base64.b64decode(document.content.contents[0]).decode("cp1251"),
                }
                yield create_es_action(index_name, action_id, doc)

In [60]:
def run_generator(index_name):
    """Bulk-index every action from es_actions_generator into `index_name`.

    Uses parallel_bulk with 4 threads, a queue of 4 chunks and 1000 actions
    per chunk. Items reported as not ok are printed with their details.
    """
    actions = es_actions_generator(index_name)
    for ok, result in parallel_bulk(es, actions, queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            # Show what failed instead of an uninformative marker.
            print(f'Failed to index document: {result}')

In [61]:
# Rebuild the index from scratch and report how long indexing took
# (difference of two time() readings).
recreate_index(raw_index_name, settings)

start = time()
run_generator(raw_index_name)
end = time()

elapsed = end - start
print(elapsed)








320.0822591781616


In [12]:
def search(index, query, *args):
    """Run `query` against `index` and print the hits.

    Any extra positional arguments are forwarded (as a tuple) to
    pretty_print_result as the names of the source fields to print.
    """
    # size is set to 20 because the default is 10; the original note says this
    # was picked for a 12-document index (10 < 12 < 20).
    response = es.search(index=index, body=query, size=20)
    pretty_print_result(response, args)
                        
def pretty_print_result(search_result, fields=()):
    """Print a summary of a search response.

    Args:
        search_result: response dict; reads ['hits']['total']['value'] and,
            for every entry of ['hits']['hits'], its '_id', '_score' and
            '_source'.
        fields: iterable of '_source' field names to print for every hit.
            Defaults to an empty tuple (immutable, so the default cannot be
            shared and modified between calls).

    Raises:
        KeyError: if a requested field is missing from a hit's '_source'.
    """
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(index, doc_id):
    """Fetch document `doc_id` from `index` and return its '_source' part."""
    response = es.get(index=index, id=doc_id)
    return response['_source']

In [13]:
# Load the query texts: query id -> query text.
# The context manager closes the file once the lines have been read.
with open('../web2008_adhoc.xml', 'r', encoding='cp1251') as inf:
    # Skip the first 9 lines (presumably the file header - TODO confirm).
    for i in range(9):
        inf.readline()
    # The last line is dropped (presumably the closing root tag - TODO confirm).
    task_lines = inf.readlines()[:-1]

queries = dict()
for s in task_lines:
    # Third piece after splitting on tags is the text between the tags.
    query_text = re.split('<.*?>', s)[2]
    # Fixed-offset id: assumes the line starts with a 10-character prefix
    # followed by an 8-character id - TODO confirm against the file format.
    query_id = s[10:18]
    queries[query_id] = query_text

In [14]:
# Parse the relevance judgements table. The file is closed after reading, and
# read() keeps the text as-is instead of doubling every newline the way
# '\n'.join(readlines()) did.
with open('../relevant_table_2009.xml') as inf:
    s = inf.read()
soup = bs4.BeautifulSoup(s, 'lxml')

In [27]:
def search_docid(index, query_id):
    """Return the 'id' source field of up to 300 hits for a stored query.

    The query text is looked up in the global `queries` by `query_id` and
    matched against the 'content' field. Ids are returned in the order of
    the hits in the response.
    """
    match_clause = {'match': {'content': queries[query_id]}}
    query = {'query': {'bool': {'must': match_clause}}}

    response = es.search(index=index, body=query, size=300)

    found_ids = []
    for hit in response['hits']['hits']:
        found_ids.append(hit['_source']['id'])
    return found_ids
    
def precision(k, ids, rels):
    """Precision over the judged documents among the first `k` ids.

    Only ids present in `rels` are counted; ids without a judgement are
    skipped and do not enlarge the denominator. Returns 0 when none of the
    first `k` ids has a judgement (including when `ids` is empty).

    Args:
        k: cut-off rank.
        ids: ranked list of document ids.
        rels: mapping of document id -> relevance value.
    """
    judged = [rels[doc_id] for doc_id in ids[:k] if doc_id in rels]
    if not judged:
        return 0
    return sum(judged) / len(judged)

def recall(k, ids, rels):
    """Recall at `k`, with the denominator capped at `k`.

    The numerator sums the relevance of the judged documents among the first
    `k` ids; the denominator is min(total relevance in `rels`, k). Returns 0
    when there are no ids in the cut or when the denominator is 0.

    Args:
        k: cut-off rank.
        ids: ranked list of document ids.
        rels: mapping of document id -> relevance value.
    """
    top_ids = ids[:k]
    if len(top_ids) == 0:
        return 0

    total_relevant = np.sum(list(rels.values()))
    denominator = min(total_relevant, k)
    if denominator == 0:
        return 0

    found = 0
    for doc_id in top_ids:
        found += rels.get(doc_id, 0)
    return found / denominator

def mean_ap(k, ids, rels):
    """Average precision of one ranking, computed over judged documents only.

    Takes the relevance values of the first `k` ids that appear in `rels`
    (in rank order), computes the precision after each of them, and averages
    those precisions over the positions weighted by relevance. Returns 0 when
    the summed relevance of that list is 0.

    Args:
        k: cut-off rank.
        ids: ranked list of document ids.
        rels: mapping of document id -> relevance value.
    """
    judged = [rels[doc_id] for doc_id in ids[:k] if doc_id in rels]
    relevant_found = np.sum(judged)
    if relevant_found == 0:
        return 0
    positions = np.arange(1, len(judged) + 1)
    precision_at_position = np.cumsum(judged) / positions
    return np.sum(precision_at_position * judged) / relevant_found
    
def rprecision(ids, rels):
    """R-precision: `precision` with the cut-off set to the total relevance in `rels`.

    Args:
        ids: ranked list of document ids.
        rels: mapping of document id -> relevance value.

    Returns:
        Whatever precision() returns for that cut-off.
    """
    # Integer cut-off: np.sum of an empty list is the float 0.0, which cannot
    # be used as a slice bound inside precision().
    num_good = int(sum(rels.values()))
    return precision(num_good, ids, rels)

In [31]:
# Per-query metric values; averaged over all tasks at the end.
p20 = []
r20 = []
m_ap = []
rp = []
for task in soup.html.body.taskdocumentmatrix.children:
    # .children also yields text nodes between the tags; only tags are tasks.
    if not isinstance(task, bs4.element.Tag):
        continue
    query_id = task['id']
    ids = search_docid(raw_index_name, query_id)
    # Judgements for this task: 1 for documents marked 'vital', 0 otherwise.
    rels = {
        doc['id']: 1 if doc['relevance'] == 'vital' else 0
        for doc in task.children
        if isinstance(doc, bs4.element.Tag)
    }
    p20.append(precision(20, ids, rels))
    r20.append(recall(20, ids, rels))
    m_ap.append(mean_ap(20, ids, rels))
    rp.append(rprecision(ids, rels))


print(np.average(p20))
print(np.average(r20))
print(np.average(m_ap))
print(np.average(rp))

KeyboardInterrupt: 

In [40]:
# Average search latency over (at most) the first 200 queries.
start = time()
qids = list(queries.keys())[:200]
for que in qids:
    search_docid(raw_index_name, que)
finish = time()
# Divide by the number of queries actually run; max() avoids division by zero
# when there are no queries.
print((finish - start) / max(len(qids), 1))

0.24428794145584107


In [49]:
# Size in bytes of the index's primary shards; the index is passed by keyword
# so the call also works with clients that only accept keyword arguments.
es.indices.stats(index=raw_index_name)['_all']['primaries']['store']['size_in_bytes']

1661894042