In [59]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import bs4
import base64
from tqdm import tqdm_notebook
import numpy as np
import locale
import pymystem3 as ms
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

In [34]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [35]:
raw_index_name="raw_index"

In [36]:
def recreate_index(name, settings):
    if es.indices.exists(index=name):
        es.indices.delete(index=name)
    es.indices.create(index=name, body=settings)
    
def check_analyzer(raw_index_name, analyzer, text):
    body = analyzer
    body['text'] = text
    
    tokens = es.indices.analyze(index=raw_index_name, body=body)['tokens']
    tokens = [token_info['token'] for token_info in tokens]
    return tokens    

In [37]:
snowball_settings = {
    'mappings': {
        'properties': {
            'id': {
                'type': 'integer'
            },
            'url': {
                'type': 'text'
            },
            'content': {
                'type': 'text',
                'fields': {
                    'complex': {
                        'type': 'text',
                        'analyzer': 'russian_complex'
                    }
                }
            }
        }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'russian_complex': {
                    'char_filter': [
                        'no_html'
                    ],
                    'tokenizer': 'word_longer_2',
                    'filter': [
                        'lowercase',
                        'russian_snow'
                    ]
                },
            },
            'char_filter': {
                'no_html': {
                    'type': 'html_strip',
                    "escaped_tags": []
                }
            },
            'tokenizer': {
                'word_longer_2': {
                    'type': 'pattern',
                    'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                    'group': 0
                }
            },
            'filter': {
                'russian_snow': {
                    'type': 'snowball',
                    'language': 'russian'
                }    
            }
        }
    }
}

no_snowball_settings = {
    'mappings': {
        'properties': {
            'id': {
                'type': 'integer'
            },
            'url': {
                'type': 'text'
            },
            'content': {
                'type': 'text',
                'fields': {
                    'complex': {
                        'type': 'text',
                        'analyzer': 'russian_complex'
                    }
                }
            }
        }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'russian_complex': {
                    'char_filter': [
                        'no_html'
                    ],
                    'tokenizer': 'word_longer_2',
                    'filter': [
                        'lowercase'
                    ]
                },
            },
            'char_filter': {
                'no_html': {
                    'type': 'html_strip',
                    "escaped_tags": []
                }
            },
            'tokenizer': {
                'word_longer_2': {
                    'type': 'pattern',
                    'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                    'group': 0
                }
            }
        }
    }
}

In [38]:
raw_snowball = 'raw_snowball'
pre_no_snowball = 'pre_no_snowball'
pre_snowball = 'pre_snowball'
lemma_no_snowball = 'lemma_no_snowball'

def recreate_all():
    recreate_index(raw_snowball, snowball_settings)
    recreate_index(pre_no_snowball, no_snowball_settings)
    recreate_index(pre_snowball, snowball_settings)
    recreate_index(lemma_no_snowball, no_snowball_settings)

In [39]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

In [40]:
def raw_generator(index_name):
    index = 0
    doc = {}
    for doc_id in range(10):
        with open(f'../byweb_for_course/byweb.{doc_id}.xml', 'r') as inf:
            inf.readline()
            for i in tqdm(range(20000)):
                index = index + 1
                a = inf.readline()
                b = inf.readline()
                c = inf.readline()
                doc['content'] = base64.b64decode(re.split('<.*?>', a)[2]).decode("cp1251")
                doc['url'] = base64.b64decode(re.split('<.*?>', b)[1]).decode("cp1251")
                doc['id'] = re.split('<.*?>', c)[1]
                yield create_es_action(index_name, index, doc)
                
def preprocessing_generator(index_name):
    index = 0
    doc = {}
    with open('docs.out', 'r') as inf:
        for i in tqdm(range(200000)):
            index = index + 1
            s = inf.readline().split('\t')
            doc['content'] = s[2]
            doc['url'] = s[1]
            doc['id'] = s[0]
            yield create_es_action(index_name, index, doc)
            
def lemmas_generator(index_name):
    index = 0
    doc = {}
    with open('lemmas.out', 'r') as inf:
        for i in tqdm(range(200000)):
            index = index + 1
            s = inf.readline().split('\t')
            doc['content'] = s[2]
            doc['url'] = s[1]
            doc['id'] = s[0]
            yield create_es_action(index_name, index, doc)

In [41]:
def run_generator(index_name, es_actions_generator):
    for ok, result in parallel_bulk(es, es_actions_generator(index_name), queue_size=4, thread_count=4, chunk_size=1000):
        if not ok:
            print('lol')

In [42]:
def search(index, query, *args):
    pretty_print_result(es.search(index=index, body=query, size=20), args)
    # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20
                        
def pretty_print_result(search_result, fields=[]):
    # fields is a list of fields names which we want to be printed
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(index, doc_id):
    return es.get(index=index, id=doc_id)['_source']

In [43]:
inf = open('../web2008_adhoc.xml', 'r', encoding='cp1251')
for i in range(9):
    inf.readline()
    
queries = dict()
for s in inf.readlines()[:-1]:
    query_text = re.split('<.*?>', s)[2]
    query_id = s[10:18]
    queries[query_id] = query_text

In [45]:
def search_docid(index, query_id):
    query = {
        'query': {
            'bool': {
                'must': {
                    'match': {
                        'content': queries[query_id]
                    }
                }
            }
        }
    }
    res = es.search(index=index, body=query, size=300)
    hits = res['hits']['hits']
    return [doc['_source']['id'] for doc in hits]
    
def precision(k, ids, rels):
    ids = ids[:k]
    if len(ids) == 0:
        return 0
    top = 0
    good = 0
    for i in ids:
        if i in rels:
            top += 1
            good += rels[i]
    if top == 0:
        return 0
    return good / top

def recall(k, ids, rels):
    ids = ids[:k]
    if len(ids) == 0:
        return 0
    good = 0
    num_good = np.sum(list(rels.values()))
    relev = min(num_good, k)
    for i in ids:
        if i in rels:
            good += rels[i]
    if relev == 0:
        return 0
    return good / relev

def mean_ap(k, ids, rels):
    ids = ids[:k]
    ps = []
    for i in ids:
        if i in rels:
            ps.append(rels[i])
    if np.sum(ps) == 0:
        return 0
    sums = np.cumsum(ps) / (np.array(range(len(ps))) + 1)
    return np.sum(sums * ps) / np.sum(ps)
    
def rprecision(ids, rels):
    num_good = np.sum(list(rels.values()))
    return precision(num_good, ids, rels)

In [65]:
mystem = ms.Mystem()
wnl = WordNetLemmatizer()

def lemmatize(a):
    lm = ''.join(mystem.lemmatize(a))
    lm = ' '.join([(wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i)) for i,j in pos_tag(list(map(lambda x: x.lower(), word_tokenize(lm))))])
    return lm

def calc_stats(index_name, lemmatize_que=False):
    inf = open('../relevant_table_2009.xml')
    s = '\n'.join(inf.readlines())
    soup = bs4.BeautifulSoup(s, 'lxml')
    
    p20 = []
    r20 = []
    m_ap = []
    rp = []
    for task in soup.html.body.taskdocumentmatrix.children:
        if not isinstance(task, bs4.element.Tag):
            continue
        query_id = task['id']
        ids = search_docid(index_name, query_id)
        rels = dict()
        for doc in task.children:
            if not isinstance(doc, bs4.element.Tag):
                continue
            doc_id = doc['id']
            #if doc_id in indexed:
            rel = 1 if (doc['relevance'] == 'vital') else 0
            rels[doc_id] = rel
        rel_ids = [-1 if i not in rels else rels[i] for i in ids]
        p20.append(precision(20, ids, rels))
        r20.append(recall(20, ids, rels))
        m_ap.append(mean_ap(20, ids, rels))
        rp.append(rprecision(ids, rels))
    
    print(f'index_name = {index_name}')
    print(f'p@20 = {np.average(p20)}')
    print(f'r@20 = {np.average(r20)}')
    print(f'map = {np.average(m_ap)}')
    print(f'r-precision = {np.average(rp)}')
    
    size = es.indices.stats(index_name)['_all']['primaries']['store']['size_in_bytes']
    print(f'size = {size}')
    
    start = time()
    it = 0
    qids = list(queries.keys())[:200]
    for que in qids:
        it += 1
        if lemmatize_que:
            que = lemmatize(que)
            print(que)
        search_docid(index_name, que)
    finish = time()
    print(f'query time: {(finish - start) / 200}')
    

In [None]:
recreate_all()
start = time()
run_generator(pre_no_snowball, preprocessing_generator)
end = time()
print(f'building time = {end - start}')
calc_stats(pre_no_snowball)

In [54]:
recreate_all()
start = time()
run_generator(pre_snowball, preprocessing_generator)
end = time()
print(f'building time = {end - start}')
calc_stats(pre_snowball)


building time = 100.38830256462097
index_name = pre_snowball
p@20 = 0.352769337076931
r@20 = 0.2833510481656717
map = 0.4619860928688837
r-precision = 0.3529810102014603
size = 1006910430
query time: 0.023696205615997314


In [64]:
recreate_all()
start = time()
run_generator(lemma_no_snowball, lemmas_generator)
end = time()
print(f'building time = {end - start}')
calc_stats(lemma_no_snowball, True)


building time = 115.24400067329407
index_name = lemma_no_snowball
p@20 = 0.30379587599025454
r@20 = 0.2224024488917856
map = 0.37690104087900733
r-precision = 0.30659024445275884
size = 1079551095
query time: 0.04361544609069824
