In [42]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import bs4
import base64
from tqdm import tqdm_notebook
import numpy as np

In [43]:
# Single Elasticsearch client shared by every helper below (they read the global `es`).
es = Elasticsearch([
    {'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25},
])

In [44]:
# Name of the index that all indexing and search cells operate on.
raw_index_name = "raw_index"

In [45]:
def recreate_index(name, settings):
    """Create index ``name`` from ``settings``, dropping an existing index of that name first.

    Uses the module-level ``es`` client. Any data already stored in ``name`` is lost.
    """
    indices = es.indices
    already_exists = indices.exists(index=name)
    if already_exists:
        indices.delete(index=name)
    indices.create(index=name, body=settings)
    
def check_analyzer(raw_index_name, analyzer, text):
    """Run ``text`` through an analyzer definition and return the resulting tokens.

    Args:
        raw_index_name: index whose analysis settings (custom filters, tokenizers)
            the analyzer definition may refer to.
        analyzer: dict describing the analysis chain (e.g. ``char_filter``,
            ``tokenizer``, ``filter``). It is NOT modified.
        text: the string to analyze.

    Returns:
        List of token strings, in the order returned by the analyze API.
    """
    # Work on a shallow copy: the request body needs a 'text' key, but writing it
    # into the caller's dict would silently change their analyzer definition.
    body = dict(analyzer)
    body['text'] = text

    response = es.indices.analyze(index=raw_index_name, body=body)
    return [token_info['token'] for token_info in response['tokens']]

In [10]:
# Index definition passed to recreate_index(): document mapping plus a custom
# Russian analysis chain (HTML stripping -> pattern tokenizer -> lowercase -> snowball stemmer).
snowball_settings = {
    'mappings': {
        'properties': {
            'id': {
                'type': 'integer'
            },
            'url': {
                'type': 'text'
            },
            # 'content' has no explicit analyzer; the 'content.complex' sub-field
            # indexes the same text through the custom 'russian_complex' analyzer.
            'content': {
                'type': 'text',
                'fields': {
                    'complex': {
                        'type': 'text',
                        'analyzer': 'russian_complex'
                    }
                }
            }
        }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                # Chain: strip HTML, tokenize by pattern, lowercase, stem.
                'russian_complex': {
                    'char_filter': [
                        'no_html'
                    ],
                    'tokenizer': 'word_longer_2',
                    'filter': [
                        'lowercase',
                        'russian_snow'
                    ]
                },
            },
            'char_filter': {
                # html_strip with an empty escaped_tags list, i.e. no tag is listed as exempt.
                'no_html': {
                    'type': 'html_strip',
                    "escaped_tags": []
                }
            },
            'tokenizer': {
                # Tokens are runs of 2 or more characters from: Latin letters, digits,
                # underscore, and the U+0400-U+04FF block; 'group': 0 selects the whole match.
                # NOTE(review): the name says "longer than 2" but '{2,}' also keeps
                # tokens of exactly 2 characters - confirm which is intended.
                'word_longer_2': {
                    'type': 'pattern',
                    'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                    'group': 0
                }
            },
            'filter': {
                # Snowball stemmer configured for Russian.
                'russian_snow': {
                    'type': 'snowball',
                    'language': 'russian'
                }
            }
        }
    }
}

In [11]:
# Drop the index if it exists and create it again with the mapping/analysis above.
recreate_index(raw_index_name, snowball_settings)

In [17]:
# Smoke check: run a small HTML snippet through an ad-hoc analysis chain.
text = '<div><a>Мое имя!!</a></div>'

# Ad-hoc chain (not 'russian_complex'): the index's 'no_html' char filter combined
# with the 'letter' tokenizer and the 'lowercase' filter.
analyzer = {
    'char_filter': ['no_html'],
    'tokenizer': 'letter',
    'filter': ['lowercase']
}

check_analyzer(raw_index_name, analyzer, text)

['мое', 'имя']

In [18]:
def create_es_action(index, doc_id, document):
    """Wrap ``document`` into a bulk action dict targeting ``index`` under id ``doc_id``."""
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [90]:
from urllib.parse import urlparse

def links_from_doc(doc):
    """Return the ``href`` of every ``<a>`` in ``doc`` whose href starts with ``'http'``.

    Anchors without an ``href`` attribute and hrefs with any other prefix
    (relative paths, ``mailto:`` ...) are left out. Document order is kept.
    """
    hrefs = (
        anchor.get('href')
        for anchor in doc.find_all('a')
        if anchor.has_attr('href')
    )
    return [href for href in hrefs if href.startswith('http')]

def fill_outer_links_map(all_links, source_url, links):
    """Count links from the host of ``source_url`` to every *other* host in ``links``.

    Args:
        all_links: dict updated in place, shaped ``{source_host: {dest_host: count}}``.
        source_url: URL of the page the links were found on.
        links: iterable of absolute URLs found on that page.

    Links pointing to the same host as ``source_url`` are ignored. Destination
    URLs that ``urlparse`` rejects are skipped; any other error propagates
    instead of being silently swallowed.
    """
    source = urlparse(source_url).netloc
    for dest_url in links:
        try:
            dest = urlparse(dest_url).netloc
        except ValueError:
            # urlparse raises ValueError for some malformed URLs
            # (e.g. unbalanced IPv6 brackets); best effort: skip this link only.
            continue
        if source == dest:
            continue
        # The source entry is created lazily, only once an outer link is seen.
        per_source = all_links.setdefault(source, {})
        per_source[dest] = per_source.get(dest, 0) + 1

In [91]:
def process_page(context, page):
    """Parse one raw collection record into a ``{'id', 'url', 'content'}`` dict.

    ``page`` is parsed with BeautifulSoup/lxml; the ``docurl`` and ``content``
    elements are base64-decoded and then decoded as cp1251. When ``context``
    has an ``'all_links'`` key, the decoded content is parsed as well and its
    outgoing links are added to ``context['all_links']``.
    """
    def _decoded(node):
        # Payload is base64-encoded cp1251 text stored as the node's first child.
        return base64.b64decode(node.contents[0]).decode("cp1251")

    record = bs4.BeautifulSoup(page, "lxml")
    doc = {
        'id': record.docid.contents[0],
        'url': _decoded(record.docurl),
        'content': _decoded(record.content),
    }

    if 'all_links' not in context:
        return doc

    html = bs4.BeautifulSoup(doc['content'], 'lxml')
    fill_outer_links_map(context['all_links'], doc['url'], links_from_doc(html))
    return doc

def es_actions_generator(context, index_name, n_files=1, docs_per_file=20000,
                         path_template='../byweb_for_course/byweb.{}.xml'):
    """Yield one bulk action per document read from the collection files.

    Args:
        context: passed through to ``process_page`` (may carry ``'all_links'``).
        index_name: target index for every generated action.
        n_files: how many files to read, numbered ``0 .. n_files - 1``.
        docs_per_file: upper bound on documents taken from each file.
        path_template: ``str.format`` template receiving the file number.

    The defaults reproduce the previous hard-coded behaviour (file 0, 20000 docs).
    The first line of each file is skipped and every document is read as three
    consecutive lines. Reading a file stops early, instead of failing inside
    ``process_page``, once the file ends or only an incomplete record is left.
    """
    for file_no in range(n_files):
        with open(path_template.format(file_no), 'r') as inf:
            inf.readline()  # skip the first line of the file
            for _ in tqdm(range(docs_per_file)):
                record_lines = [inf.readline(), inf.readline(), inf.readline()]
                # readline() returns '' at EOF: fewer than 3 lines means no full record.
                if not all(record_lines):
                    break
                doc = process_page(context, ''.join(record_lines))
                yield create_es_action(index_name, doc['id'], doc)

In [92]:
def run_generator(context, index_name):
    """Bulk-index everything produced by ``es_actions_generator`` and print failed items.

    Indexing goes through ``parallel_bulk`` with 4 threads, a queue of 4 and
    chunks of 1000 actions; each result reported as not ok is printed.
    """
    actions = es_actions_generator(context, index_name)
    bulk_options = dict(queue_size=4, thread_count=4, chunk_size=1000)
    failures = (
        info
        for succeeded, info in parallel_bulk(es, actions, **bulk_options)
        if not succeeded
    )
    for info in failures:
        print(info)

In [93]:
# Start from an empty host-to-host link map and an empty index, then index the
# collection; process_page fills context['all_links'] as a side effect.
context = {'all_links': {}}
recreate_index(raw_index_name, snowball_settings)
run_generator(context, raw_index_name)

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [87]:
def search(index, query, *args, size=20):
    """Run ``query`` against ``index`` and pretty-print the hits.

    Args:
        index: name of the index to search.
        query: request body for the search.
        *args: names of ``_source`` fields to print for every hit.
        size: maximum number of hits to fetch and print (keyword-only).
            Defaults to 20, the value that used to be hard-coded here.
    """
    pretty_print_result(es.search(index=index, body=query, size=size), args)
                        
def pretty_print_result(search_result, fields=()):
    """Print the total hit count, then id, score and selected fields of every hit.

    Args:
        search_result: search response; ``hits.total.value`` and ``hits.hits``
            are read from it.
        fields: names of ``_source`` fields to print for each hit. An immutable
            default is used so the default can never be shared or mutated.

    Raises:
        KeyError: if a requested field is absent from a hit's ``_source``.
    """
    res = search_result['hits']
    # NOTE(review): this prints the total exactly as reported in the response;
    # Elasticsearch may cap that figure (see track_total_hits) - confirm before
    # treating it as an exact count.
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(index, doc_id):
    """Fetch document ``doc_id`` from ``index`` and return its ``_source``."""
    response = es.get(index=index, id=doc_id)
    source = response['_source']
    return source

In [126]:
# Sanity check of the indexed data: match every document and print each hit's url.
query = {
    'query': {
        'match_all': {}
    }
}

search(raw_index_name, query, 'url')

Total documents: 10000
Doc 29001, score is 1.0
url: http://www.atom.by/index.php?c=3&i=1622&ic=1282
Doc 29002, score is 1.0
url: http://www.atom.by/index.php?c=8&i=2232&ic=789
Doc 29003, score is 1.0
url: http://www.atom.by/index.php?c=8&i=634&ic=2330
Doc 29004, score is 1.0
url: http://www.atom.by/index.php?c=8&i=779&ic=663
Doc 29005, score is 1.0
url: http://www.atom.by/index.php?c=8&i=2232&ic=1736
Doc 29006, score is 1.0
url: http://www.atom.by/index.php?c=3&i=1622&ic=1912
Doc 29007, score is 1.0
url: http://www.atom.by/index.php?c=16&i=126&ic=452
Doc 29008, score is 1.0
url: http://www.atom.by/index.php?c=9&i=915&ic=430
Doc 29009, score is 1.0
url: http://www.atom.by/index.php?c=8&i=74&ic=570
Doc 29010, score is 1.0
url: http://www.atom.by/index.php?c=3&i=193&ic=1912
Doc 29011, score is 1.0
url: http://www.atom.by/index.php?c=8&i=660&ic=3714
Doc 29012, score is 1.0
url: http://www.atom.by/index.php?c=3&i=2179&ic=342
Doc 29013, score is 1.0
url: http://www.atom.by/index.php?c=3&i=21

In [18]:
def get_queries(path='../web2008_adhoc.xml'):
    """Read the query file and return ``{query_id: query_text}``.

    Args:
        path: location of the cp1251-encoded query file; defaults to the
            path that used to be hard-coded here.

    The first 9 lines and the very last line of the file are skipped. For each
    remaining line the id is taken from characters 10..17 and the text is the
    third piece obtained by splitting the line on tags.
    NOTE(review): this presumably relies on every query line starting with
    ``<task id="XXXXXXXX">`` followed by one more tag - confirm against the file.
    """
    header_lines = 9
    # Context manager so the handle is closed even if parsing fails.
    with open(path, 'r', encoding='cp1251') as inf:
        lines = inf.readlines()

    queries = dict()
    for line in lines[header_lines:-1]:
        query_text = re.split('<.*?>', line)[2]
        query_id = line[10:18]
        queries[query_id] = query_text
    return queries

In [30]:
def create_match_query(query):
    """Build a search body that requires a ``match`` of ``query`` on the ``content`` field."""
    match_clause = {'match': {'content': query}}
    return {'query': {'bool': {'must': match_clause}}}

def search_docid(index, query):
    """Return the ``id`` source field of up to 300 hits matching ``query`` on ``index``.

    Ids come back in the order of the hits in the search response.
    """
    response = es.search(index=index, body=create_match_query(query), size=300)
    return [hit['_source']['id'] for hit in response['hits']['hits']]

In [46]:
def build_relevance_soup(path='../relevant_table_2009.xml'):
    """Parse the relevance-judgement file with BeautifulSoup (lxml) and return the soup.

    Args:
        path: location of the relevance table; defaults to the path that used
            to be hard-coded here.
    """
    # Context manager so the handle is closed instead of leaking.
    with open(path) as inf:
        # Lines keep their own '\n', so joining with '\n' doubles the line breaks;
        # kept as is so the parsed tree is exactly what it was before.
        s = '\n'.join(inf)
    return bs4.BeautifulSoup(s, 'lxml')

def precision(k, ids, rels):
    """Share of relevant documents among the *judged* documents in the top ``k`` of ``ids``.

    ``rels`` maps document id to a relevance value; retrieved ids missing from
    ``rels`` are ignored rather than counted as non-relevant. Returns 0 when no
    judged document is in the top ``k``.
    """
    judged = [rels[doc_id] for doc_id in ids[:k] if doc_id in rels]
    if not judged:
        return 0
    return sum(judged) / len(judged)

def recall(k, ids, rels):
    """Relevant documents found in the top ``k`` of ``ids``, divided by ``min(total relevant, k)``.

    ``rels`` maps document id to a relevance value. Returns 0 when nothing was
    retrieved or when the denominator is 0.
    """
    retrieved = ids[:k]
    if len(retrieved) == 0:
        return 0
    denominator = min(np.sum(list(rels.values())), k)
    if denominator == 0:
        return 0
    found = sum(rels[doc_id] for doc_id in retrieved if doc_id in rels)
    return found / denominator

def mean_ap(k, ids, rels):
    """Average precision of a single ranking over the judged documents in its top ``k``.

    Only ids present in ``rels`` take part; ranks are positions within that
    judged sub-list. Returns 0 when none of them is relevant.
    """
    labels = np.array([rels[doc_id] for doc_id in ids[:k] if doc_id in rels])
    relevant_total = np.sum(labels)
    if relevant_total == 0:
        return 0
    ranks = np.arange(1, len(labels) + 1)
    precision_at_rank = np.cumsum(labels) / ranks
    # Only positions holding a relevant document contribute to the average.
    return np.sum(precision_at_rank * labels) / relevant_total
    
def rprecision(ids, rels):
    """Precision at cutoff R, where R is the number of relevant documents in ``rels``.

    Args:
        ids: ranked list of retrieved document ids.
        rels: mapping of document id to relevance value (0 or 1).

    Returns:
        ``precision(R, ids, rels)``; 0 when ``rels`` holds no relevant document.
    """
    # int(): np.sum of an empty list is the float 0.0, and a float cutoff makes
    # the slice inside precision() raise TypeError.
    num_good = int(sum(rels.values()))
    return precision(num_good, ids, rels)

def run_queries(index, queries, relevance_soup):
    """Evaluate every judged task that has a query and return the averaged metrics.

    For each task element of the relevance table whose id is in ``queries``,
    the query is run against ``index`` and scored against the task's judgements
    (a document counts as relevant only when its relevance is ``'vital'``).

    Returns:
        Dict with the averages of precision@20, recall@20, average precision
        over the top 20 and R-precision, keyed ``'precision'``, ``'recall'``,
        ``'map_20'`` and ``'r-precision'``.
    """
    scores = {'precision': [], 'recall': [], 'map_20': [], 'r-precision': []}
    tasks = relevance_soup.html.body.taskdocumentmatrix.children
    for task in tqdm_notebook(tasks):
        # Skip non-element children and tasks without a known query.
        if not isinstance(task, bs4.element.Tag) or task['id'] not in queries:
            continue
        found = search_docid(index, queries[task['id']])
        judgements = {
            doc['id']: int(doc['relevance'] == 'vital')
            for doc in task.children
            if isinstance(doc, bs4.element.Tag)
        }
        scores['precision'].append(precision(20, found, judgements))
        scores['recall'].append(recall(20, found, judgements))
        scores['map_20'].append(mean_ap(20, found, judgements))
        scores['r-precision'].append(rprecision(found, judgements))
    return {metric: np.average(values) for metric, values in scores.items()}

In [None]:
# Evaluate the indexed collection against the relevance judgements.
# Uses raw_index_name (like every other cell) rather than repeating the literal
# index name, so renaming the index cannot silently desynchronise this cell.
queries = get_queries()
relevance_soup = build_relevance_soup()
run_queries(raw_index_name, queries, relevance_soup)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [94]:
# Display the collected {source_host: {dest_host: link_count}} map (large output).
context

{'all_links': {'forum.linux.by': {'www.linux.by': 225,
   'forum.4minsk.com': 45,
   'www.google.com': 45,
   'beloffice.by': 5,
   'wwp.icq.com': 111,
   'www.gnome.org': 1,
   'www.avilink.net': 45,
   'www.phpbb.com': 45,
   'br.by': 45,
   'top.list.ru': 45,
   'catalog.aport.ru': 45,
   'www.rarlab.com': 1,
   'google.com': 2,
   'pogudo.org': 3,
   'linux-hotplug.sourceforge.net': 1,
   'www.cpp.com.ua': 1,
   'www.mycomputer.ua': 2,
   'www.osnews.com': 1,
   'linux.by': 13,
   'www.macaulinux.org': 1,
   'belarus.avtonom.org': 1,
   'cvs.gna.org': 1,
   'www.linuxcenter.ru': 2,
   'd4s.linux.by': 2,
   'short-trips.net': 2,
   'neonet.cjb.net': 1,
   'www.mlug.linux.by': 1,
   'onemal.joy.by': 1,
   'victorgr.livejournal.com': 3,
   'vary.ru': 3,
   'mlug.linux.by': 10,
   'wackowiki.com': 1,
   'www.nixp.ru': 1,
   'edit.yahoo.com': 1,
   'www.mova.org': 1,
   'belregneft.com': 1,
   'www.slackware.at': 1,
   'www.shender.balta.od.ua': 1,
   'httpd.apache.org': 1,
   'www.tech