## Read and assemble results

In [1]:
# Directory that is scanned for `.json` result files by the loading cell.
RESULT_DIR = 'output'

## Read Results

In [3]:
import os
import json
from os import path

# Load every `.json` file in RESULT_DIR and split the records by their 'type':
#   * 'post' records fill `post_comments` (post URL -> the record's 'comment_urls');
#   * every other record is kept in `blog_jsons`.
blog_jsons = []
post_comments = {}
for json_file in os.listdir(RESULT_DIR):
    if not json_file.endswith('.json'):
        continue
    uri = path.join(RESULT_DIR, json_file)
    # A context manager closes the handle promptly; the explicit UTF-8 decode
    # avoids depending on the platform's default encoding.
    with open(uri, encoding='utf-8') as json_handle:
        item = json.load(json_handle)
    if item['type'] == 'post':
        post_comments[item['post_url']] = item['comment_urls']
    else:
        blog_jsons.append(item)

## Function to convert to desired format

In [7]:
def normalize_blog_url(url):
    """Truncate ``url`` right after its first ``'blog.ir'`` occurrence.

    Returns the prefix of ``url`` up to and including the first ``'blog.ir'``,
    or ``None`` when ``url`` does not contain that substring.
    """
    marker = 'blog.ir'
    prefix, found, _rest = url.partition(marker)
    if not found:
        return None
    return prefix + found

def convert_blog(item):
    """Convert one raw blog record into the nested document that gets indexed.

    Reads ``blog_url``, ``blog_name`` and the numbered ``post_content_<i>`` /
    ``post_url_<i>`` / ``post_title_<i>`` keys (i = 1..9) from ``item``. Each
    post also receives the normalized comment URLs recorded for its URL in the
    module-level ``post_comments`` mapping.

    Returns ``{'blog': {'url', 'title', 'posts'}}``, or ``None`` when
    ``normalize_blog_url`` yields nothing for the blog URL.
    """
    blog_url = normalize_blog_url(item['blog_url'])
    if not blog_url:
        return None
    blog_title = item['blog_name']

    posts = []
    for post_number in range(1, 10):
        content_key = 'post_content_%d' % post_number
        if content_key not in item:
            continue
        post_content = item[content_key]
        post_url = item['post_url_%d' % post_number]
        post_title = item['post_title_%d' % post_number]
        # Keep only the comment URLs that normalize_blog_url accepts.
        normalized_urls = (
            normalize_blog_url(raw_url)
            for raw_url in post_comments.get(post_url, [])
        )
        posts.append({
            'post_content': post_content,
            'post_url': post_url,
            'post_title': post_title,
            'post_comments': [
                {'comment_url': normalized}
                for normalized in normalized_urls
                if normalized
            ],
        })

    return {'blog': {'url': blog_url, 'title': blog_title, 'posts': posts}}
    

## Initialize Elastic Client

In [4]:
from elasticsearch import Elasticsearch

# Shared Elasticsearch client used by the indexing, update and search cells;
# configured with the local host 127.0.0.1.
es = Elasticsearch(hosts=['127.0.0.1'])

## Create Index

In [5]:
# Name of the Elasticsearch index that the following cells reset, fill and query.
INDEX_NAME = 'blog_index'

## Remove everything from Elasticsearch and start again

In [15]:
# Drop the index if it exists. A 404 response is ignored so this cell also
# succeeds on a fresh cluster where the index has never been created.
es.indices.delete(index=INDEX_NAME, ignore=404)

# Recreate the index; 400 responses are ignored.
es.indices.create(index=INDEX_NAME, ignore=400)

{'acknowledged': True}

## Build the list of blog objects

In [18]:
# Convert every raw blog record and keep only the truthy results
# (convert_blog returns None for records it rejects).
blog_objects = [converted for converted in map(convert_blog, blog_jsons) if converted]

## Add to index

In [19]:
# Send each blog document to the index, using its normalized URL as the document id.
for blog_doc in blog_objects:
    blog_url = blog_doc['blog']['url']
    es.index(index=INDEX_NAME, doc_type="blog", id=blog_url, body=blog_doc)

## Compute PageRank P Matrix

In [20]:
import numpy as np

def compute_blog_mapping(blog_objects):
    """Assign a consecutive integer index to every blog document.

    Returns a triple ``(mapping, reverse_mapping, count)``:
    ``mapping`` maps blog URL -> index (for a repeated URL the last position
    wins), ``reverse_mapping`` lists the URLs in input order, and ``count`` is
    the number of blog documents seen.
    """
    reverse_mapping = [entry['blog']['url'] for entry in blog_objects]
    mapping = {url: position for position, url in enumerate(reverse_mapping)}
    return mapping, reverse_mapping, len(reverse_mapping)


def compute_page_rank(blog_objects, alpha):
    """Compute a PageRank-style score for every blog from the comment graph.

    A link ``commenter -> blog`` is recorded whenever one of ``blog``'s posts
    carries a comment whose ``comment_url`` is itself one of the given blogs.
    Each row of the transition matrix is then made stochastic: rows without
    any link become uniform, and all other rows are normalized and blended
    with the uniform distribution using weight ``alpha``.

    Args:
        blog_objects: sequence of ``{'blog': {'url': ..., 'posts': [...]}}``
            documents, each post holding a ``post_comments`` list of
            ``{'comment_url': ...}`` dicts.
        alpha: weight of the uniform (teleportation) component mixed into
            every row that has at least one link.

    Returns:
        dict mapping blog URL -> score, with the scores scaled to sum to 1.
        An empty dict is returned when ``blog_objects`` is empty.
    """
    mapping, reverse_mapping, n_blogs = compute_blog_mapping(blog_objects)
    if n_blogs == 0:
        # Nothing to rank: an empty matrix has no eigenvalue to take argmax of.
        return {}

    p_matrix = np.zeros(shape=(n_blogs, n_blogs), dtype=float)
    for blog in blog_objects:
        this_blog = mapping[blog['blog']['url']]
        for post in blog['blog']['posts']:
            for comment in post['post_comments']:
                if comment['comment_url'] in mapping:
                    neighbor_blog = mapping[comment['comment_url']]
                    # Row = commenting blog, column = blog that was commented on.
                    p_matrix[neighbor_blog, this_blog] = 1

    uniform_row = np.ones(n_blogs) / n_blogs
    for i in range(n_blogs):
        row_total = p_matrix[i].sum()
        if row_total == 0:
            # No outgoing links: jump anywhere with equal probability.
            p_matrix[i] = uniform_row
        else:
            p_matrix[i] = (p_matrix[i] / row_total) * (1 - alpha) + uniform_row * alpha

    # Eigenvectors of the transpose are the left eigenvectors of p_matrix; take
    # the one whose eigenvalue numpy's argmax ranks largest.
    eigenvals, eigenvecs = np.linalg.eig(p_matrix.T)
    left_principal_eigen_vec = np.real(eigenvecs[:, eigenvals.argmax()].T)
    page_rank = left_principal_eigen_vec / left_principal_eigen_vec.sum()
    return dict(zip(reverse_mapping, page_rank))

In [21]:
# Score the blogs with alpha = 0.1, then write each score onto the document
# whose id is that blog's URL.
page_rank = compute_page_rank(blog_objects, 0.1)
for url, score in page_rank.items():
    es.update(index=INDEX_NAME, doc_type='blog', id=url, body={'doc': {'blog': {'page_rank': score}}})

## Search

In [7]:
def search(query, weights=None, pr_weight=0):
    """Search the blog index by text fields, optionally boosted by page rank.

    Args:
        query: dict mapping a field name relative to ``blog.`` (for example
            ``'title'`` or ``'posts.post_title'``) to the text to match.
        weights: optional dict mapping the same field names to a boost;
            fields not listed use a boost of 1.
        pr_weight: multiplier for the page-rank term. The factor sent to
            Elasticsearch is ``n_docs * pr_weight`` and the resulting value is
            added to the text score (``boost_mode`` is ``'sum'``).

    Returns:
        list of ``(url, title, page_rank, score)`` tuples, one per returned
        hit; ``page_rank`` is ``None`` for a document that has none stored.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if weights is None:
        weights = {}

    n_docs = es.count(index=INDEX_NAME, doc_type='blog')['count']

    should_clauses = [
        {
            'match': {
                'blog.' + field: {
                    'query': query[field],
                    'boost': weights.get(field, 1),
                },
            },
        }
        for field in query
    ]
    body_query = {
        'query': {
            'function_score': {
                'query': {
                    'bool': {
                        'should': should_clauses,
                    }
                },
                'functions': [
                    {
                        'field_value_factor': {
                            'field': 'blog.page_rank',
                            'factor': n_docs * pr_weight,
                            # Documents without a stored page rank contribute 0
                            # instead of making the whole request fail.
                            'missing': 0,
                        },
                    },
                ],
                'boost_mode': 'sum',
            }
        }
    }

    res = es.search(index=INDEX_NAME, doc_type='blog', body=body_query)
    results = []
    for hit in res['hits']['hits']:
        blog = hit['_source']['blog']
        results.append((blog['url'], blog['title'], blog.get('page_rank'), hit['_score']))
    return results

In [8]:
# Example query: match on blog title and URL with default field boosts and the
# page-rank term enabled (pr_weight=1).
search(query={'title': 'گاه نوشت های من', 'url': 'best'}, pr_weight=1)

[('https://roooozha.blog.ir',
  'روز های دلتنگی من',
  0.005711053963404094,
  3.5746975),
 ('https://razhayema.blog.ir', 'راز های ما', 0.022556263552940464, 2.8915126),
 ('https://mydreamylife.blog.ir',
  '!…سه نقطه های دل لیمو...!',
  0.022556263552940468,
  2.6825473),
 ('https://dinky28.blog.ir',
  '"سکوت من صدای تو"',
  0.016476137982917884,
  2.234666),
 ('https://ghahrman.blog.ir',
  'روزهای زندگی من',
  0.005711053963404085,
  2.0005362),
 ('https://hejabe-iroony.blog.ir',
  '...بنده های خوشتیپِ خدا',
  0.005711053963404085,
  1.9650261),
 ('https://mannevisi.blog.ir', 'من نویس', 0.005711053963404085, 1.6623679),
 ('https://masirika.blog.ir', 'یه خورده من!', 0.005711053963404085, 1.6425865),
 ('https://i-am-a-muslim-girl.blog.ir',
  'من یک دختر مسلمانم',
  0.005711053963404085,
  1.4120319)]

## Testing

Before using the widgets for the first time, enable the notebook widget extension by running this in a shell:
```
jupyter nbextension enable --py widgetsnbextension
```

In [13]:
from ipywidgets import widgets
from IPython.display import display, clear_output



# Free-text inputs, one per searchable field.
title_input = widgets.Text(description='Blog Title')
url_input = widgets.Text(description='Blog Url')
post_title_input = widgets.Text(description='Post Title')
post_content_input = widgets.Text(description='Post Content')
display(title_input, url_input, post_title_input, post_content_input)

# Weight sliders: one per text field plus one for the page-rank term.
# They all share the same range, step and initial value.
_slider_options = dict(min=0, max=10, step=0.1, value=1)
title_input_weight = widgets.FloatSlider(description='Blog Title Weight', **_slider_options)
url_input_weight = widgets.FloatSlider(description='Blog Url Weight', **_slider_options)
post_title_input_weight = widgets.FloatSlider(description='Post Title Weight', **_slider_options)
post_content_input_weight = widgets.FloatSlider(description='Post Content Weight', **_slider_options)
page_rank_input_weight = widgets.FloatSlider(description='Page Rank Weight', **_slider_options)
display(
    title_input_weight,
    url_input_weight,
    post_title_input_weight,
    post_content_input_weight,
    page_rank_input_weight,
)

# Output area that the submit handler prints the search results into.
text_output = widgets.Output()
display(text_output)


def handle_submit(sender):
    """Run a search from the current widget values and print every hit.

    The previous contents of ``text_output`` are cleared first. ``sender`` is
    whatever the widget callback passes in; it is not used.
    """
    with text_output:
        clear_output()
        # field name -> (text box holding the query, slider holding its weight)
        field_widgets = {
            'title': (title_input, title_input_weight),
            'url': (url_input, url_input_weight),
            'posts.post_title': (post_title_input, post_title_input_weight),
            'posts.post_content': (post_content_input, post_content_input_weight),
        }
        query = {name: pair[0].value for name, pair in field_widgets.items()}
        weights = {name: pair[1].value for name, pair in field_widgets.items()}
        hits = search(query=query, weights=weights, pr_weight=page_rank_input_weight.value)
        for result in hits:
            print(result)

# Every text box triggers the same search handler through its on_submit hook.
for text_box in (title_input, url_input, post_title_input, post_content_input):
    text_box.on_submit(handle_submit)