In [19]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import bs4
import base64
from tqdm import tqdm_notebook
import numpy as np
import networkx as nx

In [2]:
# Elasticsearch client shared by the helper functions below.
# The per-host dict also carries the connection options (timeout, maxsize).
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [3]:
# Names of the two indices built in this notebook. They must differ:
# recreate_index() deletes an existing index of the same name, so sharing one
# name makes the second recreate call wipe and replace the first index.
raw_index_name = "raw_index"
snowball_index_name = "snowball_index"

In [4]:
def recreate_index(name, settings):
    """Create index `name` from `settings`, dropping any existing index of that name first.

    Uses the module-level `es` client. Any data in a pre-existing index
    called `name` is lost.
    """
    indices = es.indices
    already_there = indices.exists(index=name)
    if already_there:
        indices.delete(index=name)
    indices.create(index=name, body=settings)
    
def check_analyzer(raw_index_name, analyzer, text):
    """Run `text` through an analyzer definition and return the produced tokens.

    Parameters
    ----------
    raw_index_name : str
        Index whose analysis settings (custom char filters etc.) are in scope.
    analyzer : dict
        Analyzer definition sent as the analyze request body. It is not modified.
    text : str
        Text to analyze.

    Returns
    -------
    list
        The 'token' value of every entry in the response's 'tokens' list.
    """
    # Build a new request body instead of writing 'text' into the caller's
    # dict, so the same analyzer definition can be reused unchanged.
    body = dict(analyzer, text=text)

    response = es.indices.analyze(index=raw_index_name, body=body)
    return [token_info['token'] for token_info in response['tokens']]

In [5]:
def _build_index_settings(token_filters, filter_definitions=None):
    """Build the index settings shared by both indices.

    Both indices use the same mappings, the same HTML-stripping char filter
    and the same pattern tokenizer; they differ only in the token filter chain
    of the 'russian_complex' analyzer.

    Parameters
    ----------
    token_filters : list of str
        Token filter names applied, in order, by the 'russian_complex' analyzer.
    filter_definitions : dict, optional
        Custom token filter definitions to register under analysis.filter.
        When omitted, no 'filter' section is emitted.

    Returns
    -------
    dict
        A freshly built settings dict (nothing is shared between calls).
    """
    analysis = {
        'analyzer': {
            'russian_complex': {
                'char_filter': [
                    'no_html'
                ],
                'tokenizer': 'word_longer_2',
                'filter': list(token_filters)
            },
        },
        'char_filter': {
            'no_html': {
                'type': 'html_strip',
                "escaped_tags": []
            }
        },
        'tokenizer': {
            # Tokens are runs of at least two Latin/digit/underscore/Cyrillic characters.
            'word_longer_2': {
                'type': 'pattern',
                'pattern': '[a-zA-Z_0-9\u0400-\u04FF]{2,}',
                'group': 0
            }
        }
    }
    if filter_definitions:
        analysis['filter'] = dict(filter_definitions)

    return {
        'mappings': {
            'properties': {
                'id': {
                    'type': 'integer'
                },
                'url': {
                    'type': 'text'
                },
                'content': {
                    'type': 'text',
                    'fields': {
                        # 'content.complex' is the sub-field analyzed with 'russian_complex'.
                        'complex': {
                            'type': 'text',
                            'analyzer': 'russian_complex'
                        }
                    }
                }
            }
        },
        'settings': {
            'analysis': analysis
        }
    }


# Lowercasing followed by Russian snowball stemming.
snowball_settings = _build_index_settings(
    ['lowercase', 'russian_snow'],
    {
        'russian_snow': {
            'type': 'snowball',
            'language': 'russian'
        }
    },
)

# Lowercasing only, no stemming.
default_settings = _build_index_settings(['lowercase'])

In [6]:
# Drop (if present) and re-create both indices with their respective analysis
# settings. Any documents already stored in them are lost.
recreate_index(raw_index_name, default_settings)
recreate_index(snowball_index_name, snowball_settings)

In [7]:
# Small HTML snippet with Russian text and punctuation, used as a sanity check.
text = '<div><a>Моего имя!!</a></div>'

# Ad-hoc analyzer: the index's 'no_html' char filter, the 'letter' tokenizer,
# then lowercasing.
analyzer = {
    'char_filter': ['no_html'],
    'tokenizer': 'letter',
    'filter': ['lowercase']
}

# The cell's value is the list of tokens produced for the snippet.
check_analyzer(raw_index_name, analyzer, text)

['моего', 'имя']

In [8]:
def create_es_action(index, doc_id, document):
    """Wrap a document into a bulk action dict.

    The result has the keys '_index', '_id' and '_source', holding the target
    index, the document id and the document body respectively.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [9]:
from urllib.parse import urlparse

def links_from_doc(doc):
    """Collect the absolute links of a parsed page.

    Looks at every 'a' element returned by `doc.find_all('a')`, keeps those
    that carry an 'href' attribute, and returns (in document order) the href
    values that start with 'http'.
    """
    hrefs = (
        anchor.get('href')
        for anchor in doc.find_all('a')
        if anchor.has_attr('href')
    )
    return [target for target in hrefs if target.startswith('http')]

def fill_outer_links_map(all_links, source_url, links):
    """Count links from the host of `source_url` to other hosts.

    Parameters
    ----------
    all_links : dict
        Nested counter updated in place: all_links[source_host][dest_host] is
        the number of links seen from source_host to dest_host.
    source_url : str
        URL of the page the links were found on.
    links : iterable of str
        URLs the page links to.

    Links pointing to the page's own host are ignored. Link URLs that
    `urlparse` rejects are skipped, so one malformed href does not abort the
    whole page.
    """
    source = urlparse(source_url).netloc
    for dest_url in links:
        try:
            dest = urlparse(dest_url).netloc
        except ValueError:
            # urlparse raises ValueError for malformed URLs (e.g. a broken
            # IPv6 literal); only that is treated as "skip this link".
            continue
        if dest == source:
            continue
        outgoing = all_links.setdefault(source, {})
        outgoing[dest] = outgoing.get(dest, 0) + 1

In [10]:
def process_page(context, page):
    """Parse one raw collection record into an indexable document.

    `page` is markup with `docid`, `docurl` and `content` elements; the URL
    and the content are base64-encoded cp1251 text. Returns a dict with the
    keys 'id', 'url' and 'content'.

    When `context` has an 'all_links' entry, the decoded content is parsed as
    HTML and its links to other hosts are counted into that entry.
    """
    envelope = bs4.BeautifulSoup(page, "lxml")
    record = {
        'id': envelope.docid.contents[0],
        'url': base64.b64decode(envelope.docurl.contents[0]).decode("cp1251"),
        'content': base64.b64decode(envelope.content.contents[0]).decode("cp1251"),
    }
    if 'all_links' in context:
        html = bs4.BeautifulSoup(record['content'], 'lxml')
        fill_outer_links_map(context['all_links'], record['url'], links_from_doc(html))
    return record

def es_actions_generator(context, index_name):
    """Yield one bulk action per document of the collection.

    Reads the ten files byweb.0.xml .. byweb.9.xml. In each file the first
    line is skipped, then 20000 records of three lines each are read, parsed
    with process_page() and wrapped with create_es_action() for `index_name`.
    """
    for part in range(10):
        path = f'../byweb_for_course/byweb.{part}.xml'
        with open(path, 'r') as inf:
            # Skip the first line of the file.
            inf.readline()
            for _ in tqdm(range(20000)):
                # One record spans three consecutive lines.
                page = ''.join(inf.readline() for _line in range(3))
                doc = process_page(context, page)
                yield create_es_action(index_name, doc['id'], doc)

In [11]:
def run_generator(context, index_name):
    """Bulk-index the whole collection into `index_name`, printing failed actions.

    Actions come from es_actions_generator() and are sent through
    parallel_bulk with 4 threads, a queue of 4 and chunks of 1000 actions.
    """
    actions = es_actions_generator(context, index_name)
    outcomes = parallel_bulk(es, actions, queue_size=4, thread_count=4, chunk_size=1000)
    for succeeded, info in outcomes:
        if succeeded:
            continue
        print(info)

In [12]:
# Shared indexing state: 'all_links' accumulates host-to-host link counts
# while the pages are parsed.
context = {}
context['all_links'] = {}
# Start from empty indices before the bulk load.
recreate_index(raw_index_name, default_settings)
recreate_index(snowball_index_name, snowball_settings)

print('Filling raw index')
run_generator(context, raw_index_name)

# Loading the snowball index is currently disabled.
#print('Filling snowball index')
#run_generator(context, snowball_index_name)

Filling raw index


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [87]:
def search(index, query, *args):
    """Run `query` against `index` and print the hits.

    Extra positional arguments are the names of the `_source` fields to print
    for every hit. At most 20 hits are requested, overriding the default page
    size of 10.
    """
    response = es.search(index=index, body=query, size=20)
    pretty_print_result(response, args)
                        
def pretty_print_result(search_result, fields=[]):
    """Print a search response in a compact, readable form.

    First the total hit count, then one line per hit with its id and score,
    followed by one line per requested `_source` field.

    `fields` lists the names of the `_source` fields to print for every hit.
    """
    hits_section = search_result['hits']
    total = hits_section["total"]["value"]
    print(f'Total documents: {total}')
    for hit in hits_section['hits']:
        header = f'Doc {hit["_id"]}, score is {hit["_score"]}'
        print(header)
        for name in fields:
            value = hit["_source"][name]
            print(f'{name}: {value}')
                  
def get_doc_by_id(index, doc_id):
    """Fetch a document by id from `index` and return its `_source`."""
    stored = es.get(index=index, id=doc_id)
    return stored['_source']

In [126]:
# Match every document: a quick look at what ended up in the index.
query = {
    'query': {
        'match_all': {}
    }
}

# Print the hits together with their 'url' field.
search(raw_index_name, query, 'url')

Total documents: 10000
Doc 29001, score is 1.0
url: http://www.atom.by/index.php?c=3&i=1622&ic=1282
Doc 29002, score is 1.0
url: http://www.atom.by/index.php?c=8&i=2232&ic=789
Doc 29003, score is 1.0
url: http://www.atom.by/index.php?c=8&i=634&ic=2330
Doc 29004, score is 1.0
url: http://www.atom.by/index.php?c=8&i=779&ic=663
Doc 29005, score is 1.0
url: http://www.atom.by/index.php?c=8&i=2232&ic=1736
Doc 29006, score is 1.0
url: http://www.atom.by/index.php?c=3&i=1622&ic=1912
Doc 29007, score is 1.0
url: http://www.atom.by/index.php?c=16&i=126&ic=452
Doc 29008, score is 1.0
url: http://www.atom.by/index.php?c=9&i=915&ic=430
Doc 29009, score is 1.0
url: http://www.atom.by/index.php?c=8&i=74&ic=570
Doc 29010, score is 1.0
url: http://www.atom.by/index.php?c=3&i=193&ic=1912
Doc 29011, score is 1.0
url: http://www.atom.by/index.php?c=8&i=660&ic=3714
Doc 29012, score is 1.0
url: http://www.atom.by/index.php?c=3&i=2179&ic=342
Doc 29013, score is 1.0
url: http://www.atom.by/index.php?c=3&i=21

In [18]:
def get_queries():
    """Load the evaluation queries from '../web2008_adhoc.xml'.

    The first nine lines and the last line of the file are skipped; every
    remaining line is treated as one query.

    Returns
    -------
    dict
        Maps query id to query text. The id is taken from the fixed character
        range 10..17 of the line; the text is the content between the second
        and third tag of the line.
    """
    # Read everything inside a `with` block so the file handle is closed
    # even if parsing fails.
    with open('../web2008_adhoc.xml', 'r', encoding='cp1251') as inf:
        lines = inf.readlines()

    queries = dict()
    # lines[9:-1] drops the nine leading lines and the final line.
    for line in lines[9:-1]:
        query_text = re.split('<.*?>', line)[2]
        query_id = line[10:18]
        queries[query_id] = query_text
    return queries

In [30]:
def create_match_query(query):
    """Build a search body that requires `query` to match the 'content' field.

    The match clause is wrapped in a bool query as its single 'must' clause.
    """
    match_clause = {'match': {'content': query}}
    return {'query': {'bool': {'must': match_clause}}}

def search_docid(index, query):
    """Return the 'id' field of up to 300 hits matching the text `query` in `index`.

    Ids are returned in the order of the hits in the search response.
    """
    response = es.search(index=index, body=create_match_query(query), size=300)
    return [hit['_source']['id'] for hit in response['hits']['hits']]

In [46]:
def build_relevance_soup():
    """Parse '../relevant_table_2009.xml' into a BeautifulSoup tree.

    Returns the tree produced by the 'lxml' parser.
    """
    # `with` guarantees the file handle is released.
    with open('../relevant_table_2009.xml') as inf:
        # readlines() keeps each line's own '\n', so this join doubles the
        # line breaks; kept as-is so the parser input is unchanged.
        s = '\n'.join(inf.readlines())
    return bs4.BeautifulSoup(s, 'lxml')

def precision(k, ids, rels):
    """Precision over the judged documents among the first `k` results.

    `ids` is the ranked result list and `rels` maps document id to a
    relevance value. Results without an entry in `rels` are ignored: the
    result is the sum of relevance values divided by the number of judged
    documents in the top `k`. Returns 0 when there are no results or none of
    them is judged.
    """
    head = ids[:k]
    if len(head) == 0:
        return 0
    judged = [rels[doc_id] for doc_id in head if doc_id in rels]
    if not judged:
        return 0
    return sum(judged) / len(judged)

def recall(k, ids, rels):
    """Recall of the first `k` results, capped by `k`.

    The numerator is the sum of relevance values of the judged documents in
    the top `k`; the denominator is the smaller of the total relevance in
    `rels` and `k`. Returns 0 when there are no results or the denominator
    is 0.
    """
    head = ids[:k]
    if len(head) == 0:
        return 0
    attainable = min(np.sum(list(rels.values())), k)
    found = sum(rels[doc_id] for doc_id in head if doc_id in rels)
    if attainable == 0:
        return 0
    return found / attainable

def mean_ap(k, ids, rels):
    """Average precision of one ranked list over the judged documents in the top `k`.

    Unjudged results are dropped. Precision is computed at each position of
    the remaining list, and the precisions at relevant positions are averaged
    over the number of relevant documents retrieved. Returns 0 when no
    relevant document is among the judged top `k`.
    """
    gains = [rels[doc_id] for doc_id in ids[:k] if doc_id in rels]
    relevant_found = np.sum(gains)
    if relevant_found == 0:
        return 0
    ranks = np.arange(1, len(gains) + 1)
    precision_at_rank = np.cumsum(gains) / ranks
    return np.sum(precision_at_rank * gains) / np.sum(gains)
    
def rprecision(ids, rels):
    """R-precision: precision at rank R, where R is the total relevance in `rels`.

    `ids` is the ranked result list and `rels` maps document id to a
    relevance value.
    """
    # int(sum(...)) keeps the cut-off an integer even when `rels` is empty;
    # np.sum([]) is the float 0.0, which cannot be used as a slice bound.
    num_good = int(sum(rels.values()))
    return precision(num_good, ids, rels)

def run_queries(index, queries, relevance_soup):
    """Evaluate `index` on every judged task that has a known query.

    For each tag child of the relevance table's `taskdocumentmatrix` element
    whose 'id' is a key of `queries`, the query is run through
    search_docid() and scored against the task's judgements. A document
    counts as relevant (1) only when its 'relevance' attribute is 'vital';
    every other judged document gets 0.

    Returns a dict with the averages over all evaluated tasks of:
    'precision' and 'recall' (both at 20), 'map_20' and 'r-precision'.
    """
    scores = {'precision': [], 'recall': [], 'map_20': [], 'r-precision': []}
    tasks = relevance_soup.html.body.taskdocumentmatrix.children
    for task in tqdm_notebook(tasks):
        # Children also include non-tag nodes; only tags are tasks.
        if not isinstance(task, bs4.element.Tag):
            continue
        query_id = task['id']
        if query_id not in queries:
            continue
        ids = search_docid(index, queries[query_id])
        rels = {
            doc['id']: int(doc['relevance'] == 'vital')
            for doc in task.children
            if isinstance(doc, bs4.element.Tag)
        }
        scores['precision'].append(precision(20, ids, rels))
        scores['recall'].append(recall(20, ids, rels))
        scores['map_20'].append(mean_ap(20, ids, rels))
        scores['r-precision'].append(rprecision(ids, rels))
    return {metric: np.average(values) for metric, values in scores.items()}

In [None]:
# Load the queries and the relevance judgements, then evaluate the index
# named 'raw_index'; the cell's value is the dict of averaged metrics.
queries = get_queries()
relevance_soup = build_relevance_soup()
run_queries('raw_index', queries, relevance_soup)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [14]:
# Persist the host-to-host link counts collected during indexing as JSON.
with open("../links.json", "w") as links_file:
    json.dump(context['all_links'], links_file)

In [20]:
def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-6, nstart=None, weight='weight', dangling=None):
    """Return the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
        A NetworkX graph. Undirected graphs will be converted to a directed
        graph with two directed edges for each undirected edge.

    alpha : float, optional
        Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
        The "personalization vector" consisting of a dictionary with a
        key for every graph node and nonzero personalization value for each
        node. By default, a uniform distribution is used.

    max_iter : integer, optional
        Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
        Error tolerance used to check convergence in power method solver.

    nstart : dictionary, optional
        Starting value of PageRank iteration for each node.

    weight : key, optional
        Edge data key to use as weight. If None weights are set to 1.

    dangling : dict, optional
        The outedges to be assigned to any "dangling" nodes, i.e., nodes
        without any outedges. The dict key is the node the outedge points to
        and the dict value is the weight of that outedge. By default, dangling
        nodes are given outedges according to the personalization vector
        (uniform if not specified). It may be common to have the dangling dict
        be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
        Dictionary of nodes with PageRank as value.

    Raises
    ------
    nx.NetworkXError
        If `personalization` or `dangling` lacks a value for some node, or if
        the power iteration does not converge within `max_iter` iterations.

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence. The iteration will stop
    after max_iter iterations or an error tolerance of
    number_of_nodes(G)*tol has been reached.

    The PageRank algorithm was designed for directed graphs; undirected
    input is converted by turning each edge into two directed edges.
    """
    if len(G) == 0:
        return {}

    D = G if G.is_directed() else G.to_directed()

    # Create a copy in (right) stochastic form
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()

    # Choose fixed starting vector if not given
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        # Normalized nstart vector
        s = float(sum(nstart.values()))
        x = dict((k, v / s) for k, v in nstart.items())

    if personalization is None:
        # Assign uniform personalization vector if not given
        p = dict.fromkeys(W, 1.0 / N)
    else:
        missing = set(G) - set(personalization)
        if missing:
            # The exception class lives in the networkx namespace; the bare
            # name is not imported in this notebook.
            raise nx.NetworkXError('Personalization dictionary '
                                   'must have a value for every node. '
                                   'Missing nodes %s' % missing)
        s = float(sum(personalization.values()))
        p = dict((k, v / s) for k, v in personalization.items())

    if dangling is None:
        # Use personalization vector if dangling vector not specified
        dangling_weights = p
    else:
        missing = set(G) - set(dangling)
        if missing:
            raise nx.NetworkXError('Dangling node dictionary '
                                   'must have a value for every node. '
                                   'Missing nodes %s' % missing)
        s = float(sum(dangling.values()))
        dangling_weights = dict((k, v / s) for k, v in dangling.items())
    dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
        for n in x:
            # this matrix multiply looks odd because it is
            # doing a left multiply x^T=xlast^T*W
            for nbr in W[n]:
                x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
            x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n]

        # check convergence, l1 norm
        err = sum([abs(x[n] - xlast[n]) for n in x])
        if err < N * tol:
            return x
    raise nx.NetworkXError('pagerank: power iteration failed to converge '
                           'in %d iterations.' % max_iter)



In [25]:
def get_graph(all_links):
    """Build a directed graph from the nested link map.

    Adds one edge source -> dest for every dest listed under a source in
    `all_links`. The stored link counts are not attached to the edges.
    """
    graph = nx.DiGraph()
    for source, targets in all_links.items():
        for dest in targets:
            graph.add_edge(source, dest)
    return graph

In [26]:
# PageRank of every host in the link graph collected during indexing;
# the cell's value is the full host -> score dict.
pagerank(get_graph(context['all_links']))

{'forum.linux.by': 2.3650844376701482e-05,
 'www.linux.by': 1.3927620321214575e-05,
 'forum.4minsk.com': 1.1043773845920815e-05,
 'www.google.com': 9.708035970802984e-05,
 'beloffice.by': 1.0782181568619056e-05,
 'wwp.icq.com': 0.0001917320473010637,
 'www.gnome.org': 1.087448430080129e-05,
 'www.avilink.net': 1.3839518378086974e-05,
 'www.phpbb.com': 0.00021304668977953142,
 'br.by': 4.7108277695884215e-05,
 'top.list.ru': 6.004061353585748e-05,
 'catalog.aport.ru': 1.1976507588092472e-05,
 'www.rarlab.com': 1.3454425179028217e-05,
 'google.com': 0.0001492825855326611,
 'pogudo.org': 1.0982950405510333e-05,
 'linux-hotplug.sourceforge.net': 1.0782181568619056e-05,
 'www.cpp.com.ua': 1.1432502881877203e-05,
 'www.mycomputer.ua': 1.0782181568619056e-05,
 'www.osnews.com': 1.0782181568619056e-05,
 'linux.by': 1.3367675805412043e-05,
 'www.macaulinux.org': 1.0782181568619056e-05,
 'belarus.avtonom.org': 1.0792734587175481e-05,
 'cvs.gna.org': 1.0782181568619056e-05,
 'www.linuxcenter.ru':