## Read and assemble results

In [1]:
# Directory whose entries are each loaded as one JSON document by the "Read Results" cell.
RESULT_DIR = 'output'

## Read Results

In [6]:
import os
import json
from os import path

# blog_jsons   : raw records whose 'type' is anything other than 'post'
# post_comments: post URL -> list of comment URLs, taken from 'post' records
blog_jsons = []
post_comments = {}

# sorted() makes the load order (and therefore the order of blog_jsons)
# reproducible between runs; os.listdir() returns entries in arbitrary order.
for json_file in sorted(os.listdir(RESULT_DIR)):
    uri = path.join(RESULT_DIR, json_file)
    if not path.isfile(uri):
        # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing in open().
        continue
    # Context manager closes the handle; explicit UTF-8 because the records
    # contain non-ASCII text and the platform default encoding may differ.
    with open(uri, encoding='utf-8') as result_file:
        item = json.load(result_file)
    if item['type'] == 'post':
        post_comments[item['post_url']] = item['comment_urls']
    else:
        blog_jsons.append(item)

## Function to convert to desired format

In [7]:
def normalize_blog_url(url):
    """Truncate ``url`` right after the first occurrence of ``'blog.ir'``.

    Returns the prefix of ``url`` up to and including the first literal
    ``'blog.ir'`` substring, or ``None`` when the substring is absent.
    The match is a plain substring search; it is not anchored to the host part.
    """
    marker = 'blog.ir'
    try:
        cut = url.index(marker)
    except ValueError:
        # Marker not present: the URL cannot be reduced to a blog address.
        return None
    return url[:cut + len(marker)]

def convert_blog(item):
    """Turn one raw blog record into the nested document that gets indexed.

    Reads ``blog_url`` and ``blog_name`` from ``item`` plus, for each ``i`` in
    1..9 where ``post_content_<i>`` is present, the matching
    ``post_url_<i>`` / ``post_title_<i>`` keys. Comment URLs for a post are
    looked up in the module-level ``post_comments`` dict and normalized;
    comments whose URL cannot be normalized are dropped.

    Returns ``{'blog': {'url', 'title', 'posts'}}``, or ``None`` when the
    blog URL itself cannot be normalized.
    """
    blog_url = normalize_blog_url(item['blog_url'])
    if not blog_url:
        return None
    blog_title = item['blog_name']

    posts = []
    for idx in range(1, 10):
        content_key = 'post_content_%d' % idx
        if content_key not in item:
            continue
        post_url = item['post_url_%d' % idx]
        post_title = item['post_title_%d' % idx]
        commenters = []
        for raw_comment_url in post_comments.get(post_url, []):
            commenter_url = normalize_blog_url(raw_comment_url)
            if commenter_url:
                commenters.append({'comment_url': commenter_url})
        posts.append({
            'post_content': item[content_key],
            'post_url': post_url,
            'post_title': post_title,
            'post_comments': commenters,
        })

    return {'blog': {'url': blog_url, 'title': blog_title, 'posts': posts}}
    

## Initialize Elasticsearch client

In [1]:
from elasticsearch import Elasticsearch

# Client bound to the node at 127.0.0.1; no port is given here, so the client library's default applies.
es = Elasticsearch(hosts=['127.0.0.1'])

## Create Index

In [2]:
# Name of the index used by every create/delete/index/update/count/search call in this notebook.
INDEX_NAME = 'blog_index'

## Remove everything from Elasticsearch and start again

In [15]:
# Drop the index if it is there. 404 is ignored so this cell also works on the
# very first run (or after a manual delete), when the index does not exist yet.
es.indices.delete(index=INDEX_NAME, ignore=404)

# Recreate it empty; 400 is ignored so an "already exists" response is not fatal.
es.indices.create(index=INDEX_NAME, ignore=400)

{'acknowledged': True}

## Build the list of blog objects

In [18]:
# Convert every raw blog record and keep only the successful conversions
# (convert_blog returns None when the blog URL cannot be normalized).
blog_objects = [blog for blog in map(convert_blog, blog_jsons) if blog]

## Add to index

In [19]:
# Store each blog document, using its normalized URL as the document id.
for blog_doc in blog_objects:
    blog_id = blog_doc['blog']['url']
    es.index(index=INDEX_NAME, doc_type="blog", id=blog_id, body=blog_doc)

## Compute PageRank P Matrix

In [20]:
import numpy as np

def compute_blog_mapping(blog_objects):
    """Assign a dense integer index to every distinct blog URL.

    Args:
        blog_objects: iterable of ``{'blog': {'url': ...}}`` documents.

    Returns:
        A ``(mapping, reverse_mapping, n_blogs)`` tuple where ``mapping`` maps
        URL -> index, ``reverse_mapping[index]`` is the URL, and ``n_blogs``
        is the number of distinct URLs.

    A URL that appears more than once keeps the index of its first
    occurrence. Previously a repeated URL still consumed a fresh index,
    which left an unreachable extra node in the PageRank matrix.
    """
    mapping = {}
    reverse_mapping = []
    for blog in blog_objects:
        url = blog['blog']['url']
        if url in mapping:
            # Same blog seen again: do not allocate another index.
            continue
        mapping[url] = len(reverse_mapping)
        reverse_mapping.append(url)
    return mapping, reverse_mapping, len(reverse_mapping)


def compute_page_rank(blog_objects, alpha):
    """Compute a PageRank score per blog from the comment graph.

    An edge ``commenter -> blog`` is recorded whenever a blog present in the
    mapping appears as ``comment_url`` on one of ``blog``'s posts. Rows of the
    resulting matrix are turned into probability distributions: a row with no
    outgoing edge becomes uniform, any other row is mixed with the uniform
    distribution using weight ``alpha``.

    Args:
        blog_objects: re-iterable collection (it is traversed twice) of
            ``{'blog': {'url', 'posts': [{'post_comments': [...]}]}}`` documents.
        alpha: weight of the uniform (teleport) component, e.g. ``0.1``.

    Returns:
        dict mapping blog URL -> score, with scores normalized to sum to 1.
        An empty dict is returned when there are no blogs; previously that
        case raised inside the eigen-decomposition / ``argmax``.
    """
    mapping, reverse_mapping, n_blogs = compute_blog_mapping(blog_objects)
    if n_blogs == 0:
        return {}

    # p_matrix[i, j] == 1  <=>  blog i commented on a post of blog j.
    p_matrix = np.zeros(shape=(n_blogs, n_blogs), dtype=float)
    for blog in blog_objects:
        this_blog = mapping[blog['blog']['url']]
        for post in blog['blog']['posts']:
            for comment in post['post_comments']:
                if comment['comment_url'] in mapping:
                    neighbor_blog = mapping[comment['comment_url']]
                    p_matrix[neighbor_blog, this_blog] = 1

    # Loop-invariant uniform distribution, built once instead of per row.
    uniform_row = np.ones(n_blogs) / n_blogs
    for i in range(n_blogs):
        row_total = p_matrix[i].sum()
        if row_total == 0:
            # No outgoing edges: jump anywhere with equal probability.
            p_matrix[i] = uniform_row
        else:
            p_matrix[i] = (p_matrix[i] / row_total) * (1 - alpha) + uniform_row * alpha

    # Left eigenvectors of P are the (right) eigenvectors of P transposed; the
    # one selected by argmax over the eigenvalues is taken as the stationary
    # distribution (expected to belong to eigenvalue 1 of the stochastic matrix).
    eigenvals, eigenvecs = np.linalg.eig(p_matrix.T)
    left_principal_eigen_vec = np.real(eigenvecs[:, eigenvals.argmax()].T)
    # Dividing by the sum fixes both the scale and the arbitrary sign of the vector.
    page_rank = left_principal_eigen_vec / left_principal_eigen_vec.sum()
    return dict(zip(reverse_mapping, page_rank))

In [21]:
# Compute the scores with a teleport weight of 0.1, then write each one onto
# the matching blog document as a partial update of blog.page_rank.
page_rank = compute_page_rank(blog_objects, 0.1)
for blog_url, rank in page_rank.items():
    partial_doc = {'doc': {'blog': {'page_rank': rank}}}
    es.update(index=INDEX_NAME, doc_type='blog', id=blog_url, body=partial_doc)

## Search

In [3]:
def search(query, weights=None, pr_weight=0):
    """Search the blog index, optionally adding a PageRank term to the score.

    Args:
        query: dict mapping a field name under ``blog`` (e.g. ``'title'``,
            ``'url'``) to the text to match in that field. Each entry becomes
            one ``should`` clause.
        weights: optional dict mapping field name -> boost for that field's
            clause; fields not listed get boost 1. ``None`` means no custom
            boosts (replaces the former mutable ``{}`` default).
        pr_weight: multiplier for the PageRank contribution. The stored
            ``blog.page_rank`` is scaled by ``n_docs * pr_weight`` and added
            to the text score (``boost_mode='sum'``); 0 disables it.

    Returns:
        List of ``(url, title, page_rank, score)`` tuples, one per hit, in the
        order returned by Elasticsearch. ``page_rank`` is ``None`` for a
        document that has not been given a rank yet.
    """
    if weights is None:
        weights = {}

    n_docs = es.count(index=INDEX_NAME, doc_type='blog')['count']

    should_clauses = [
        {
            'match': {
                'blog.' + field: {
                    'query': query[field],
                    'boost': weights.get(field, 1),
                },
            },
        }
        for field in query
    ]

    body_query = {
        'query': {
            'function_score': {
                'query': {
                    'bool': {
                        'should': should_clauses
                    }
                },
                'functions': [
                    {
                        'field_value_factor': {
                            'field': 'blog.page_rank',
                            'factor': n_docs * pr_weight,
                            # Documents without a rank contribute 0 instead of
                            # making the whole request fail.
                            'missing': 0,
                        },
                    },
                ],
                'boost_mode': 'sum',
            }
        }
    }

    res = es.search(index=INDEX_NAME, doc_type='blog', body=body_query)

    results = []
    for hit in res['hits']['hits']:
        blog = hit['_source']['blog']
        results.append((blog['url'], blog['title'], blog.get('page_rank'), hit['_score']))
    return results

In [4]:
# Example: match on both the title and the url field, with the PageRank term switched on (pr_weight=1).
search(query={'title': 'گاه نوشت های من', 'url': 'best'}, pr_weight=1)

[('http://raha1100.blog.ir', '✗من و تنهایی✗', 0.014597401011421764, 17.519613),
 ('http://best-world.blog.ir',
  'گاه نوشت های من',
  0.0004736475716262781,
  16.6216),
 ('http://hesarman.blog.ir', 'حصار من', 0.012475031818105308, 15.336618),
 ('http://mahboobeh-k.blog.ir',
  'گاه نوشت های من',
  0.0005445151140428768,
  11.896838),
 ('http://1moallem.blog.ir',
  'گاه نوشت یک خانم معلم',
  0.00011581067676829718,
  7.1706457),
 ('http://mahmoudbanaei.blog.ir',
  'گاه نگاری های یک مهندس',
  0.0009483596977009462,
  6.5440683),
 ('http://l-mydaily-l.blog.ir',
  'گاه نوشت هایِ یک عددپشت کنکوری📚',
  0.00011581067676829718,
  6.5389566),
 ('http://history1391.blog.ir',
  'گاه نوشته\u200cهای من',
  0.0010737227146008332,
  6.5168233),
 ('http://hdana.blog.ir', 'نوشته های خیس من', 0.001051115065020101, 6.14952),
 ('http://dailylife.blog.ir',
  'ذهن زیبای من',
  0.0037071028491997707,
  6.1379056)]

## Testing

Before using the widgets for the first time, enable the widgets extension by running this once in a shell:
```
jupyter nbextension enable --py widgetsnbextension
```

In [15]:
from ipywidgets import widgets
from IPython.display import display, clear_output



# Two text inputs for the query fields and an output area for the printed hits,
# displayed in that order.
title_input = widgets.Text(description='Title')
url_input = widgets.Text(description='Url')
text_output = widgets.Output()
display(title_input, url_input, text_output)


def handle_submit(sender):
    """Run a search with the current input values and print the hits.

    ``sender`` is the widget that fired the event; it is not used. The
    previous content of ``text_output`` is cleared before the new results
    (one tuple per line) are printed into it.
    """
    current_query = {'title': title_input.value, 'url': url_input.value}
    with text_output:
        clear_output()
        for hit in search(query=current_query, pr_weight=1):
            print(hit)

# Submitting from either input box triggers the same search handler.
for input_box in (title_input, url_input):
    input_box.on_submit(handle_submit)