In [1]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import bs4
import base64
from tqdm import tqdm_notebook

In [2]:
# Shared Elasticsearch client for the node on localhost:9200, used by the helper functions in this notebook.
es = Elasticsearch(
    [dict(host='localhost', port=9200, timeout=360, maxsize=25)]
)

In [8]:
# Name of the Elasticsearch index that this notebook creates, fills and queries.
raw_index_name = 'raw_index'

In [9]:
def recreate_index(name, settings):
    """Create index ``name`` from ``settings``, deleting an existing index of that name first.

    Works through the module-level ``es`` client. ``settings`` is forwarded
    unchanged as the ``body`` of the create request.
    """
    index_api = es.indices
    already_present = index_api.exists(index=name)
    if already_present:
        index_api.delete(index=name)
    index_api.create(index=name, body=settings)
    
def check_analyzer(raw_index_name, analyzer, text):
    """Run ``text`` through an analysis chain and return the resulting tokens.

    Args:
        raw_index_name: name of the index the analyze request is issued against.
        analyzer: dict describing the analysis chain; it is sent as the request
            body together with ``text``. The dict is not modified.
        text: the string to analyze.

    Returns:
        list: the ``'token'`` value of every entry in the response's
        ``'tokens'`` list, in response order.
    """
    # Build a fresh request body so the caller's analyzer dict is left untouched
    # and can be reused for further calls.
    body = {**analyzer, 'text': text}

    response = es.indices.analyze(index=raw_index_name, body=body)
    return [token_info['token'] for token_info in response['tokens']]

In [10]:
# Index definition handed to the index-creation call: the field mappings plus
# the custom analysis chain that the `content.complex` sub-field refers to.
settings = {
    "mappings": {
        "properties": {
            "id": {"type": "integer"},
            "url": {"type": "text"},
            "content": {
                "type": "text",
                # The same text is indexed a second time through the custom analyzer.
                "fields": {
                    "complex": {"type": "text", "analyzer": "russian_complex"},
                },
            },
        },
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "russian_complex": {
                    "char_filter": ["no_html"],
                    "tokenizer": "word_longer_2",
                    "filter": ["lowercase", "russian_snow"],
                },
            },
            "char_filter": {
                "no_html": {"type": "html_strip", "escaped_tags": []},
            },
            "tokenizer": {
                "word_longer_2": {
                    "type": "pattern",
                    # Matches runs of at least two characters drawn from Latin
                    # letters, digits, underscore and the U+0400-U+04FF block.
                    "pattern": "[a-zA-Z_0-9\u0400-\u04FF]{2,}",
                    "group": 0,
                },
            },
            "filter": {
                "russian_snow": {"type": "snowball", "language": "russian"},
            },
        },
    },
}

In [11]:
# (Re)create the index from scratch using the mappings and analysis settings.
recreate_index(name=raw_index_name, settings=settings)

In [17]:
# Sanity check of an ad-hoc chain: strip the HTML tags with the index's
# `no_html` char filter, split with the `letter` tokenizer, then lowercase.
text = '<div><a>Мое имя!!</a></div>'
analyzer = dict(
    char_filter=['no_html'],
    tokenizer='letter',
    filter=['lowercase'],
)

check_analyzer(raw_index_name, analyzer, text)

['мое', 'имя']

In [18]:
def create_es_action(index, doc_id, document):
    """Wrap ``document`` in a bulk action dict addressed to ``index`` under id ``doc_id``."""
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [19]:
def es_actions_generator(index_name):
    """Yield one bulk action per document found in the ten byweb XML dumps.

    Reads ``../byweb_for_course/byweb.0.xml`` through ``byweb.9.xml``. In each
    file, starting at the second line, every three consecutive lines are joined
    and parsed as one document record. The record's ``docurl`` and ``content``
    elements are base64-decoded and interpreted as cp1251 text.

    Document ids are a running counter that starts at 1 and continues across
    files.

    Args:
        index_name: name of the index the generated actions are addressed to.

    Yields:
        The result of ``create_es_action`` for each document; the document
        source has the keys ``id``, ``url`` and ``content``.
    """
    doc_counter = 0
    for file_no in range(10):
        # Read everything up front and close the file right away, so no file
        # handle stays open while the generator is suspended between yields.
        with open(f'../byweb_for_course/byweb.{file_no}.xml', 'r') as inf:
            lines = inf.readlines()

        for start in tqdm_notebook(range(1, len(lines) - 1, 3)):
            doc_counter += 1
            page = lines[start] + lines[start + 1] + lines[start + 2]
            document = bs4.BeautifulSoup(page, "lxml")
            doc = {
                'id': doc_counter,
                'url': base64.b64decode(document.docurl.contents[0]).decode("cp1251"),
                # The key has to be exactly 'content': that is the field the
                # index mapping declares (with its `complex` sub-field). A
                # misspelled key would end up in a different field and the
                # custom analyzer would never see the text.
                'content': base64.b64decode(document.content.contents[0]).decode("cp1251"),
            }
            yield create_es_action(index_name, doc_counter, doc)

In [15]:
def run_generator(index_name):
    """Bulk-index everything produced by ``es_actions_generator`` into ``index_name``.

    The actions are streamed through ``parallel_bulk`` on the module-level
    ``es`` client; every result reported as not ok is printed.
    """
    outcomes = parallel_bulk(
        es,
        es_actions_generator(index_name),
        queue_size=4,
        thread_count=4,
        chunk_size=1000,
    )
    for succeeded, info in outcomes:
        if succeeded:
            continue
        print(info)

In [124]:
# Start from an empty index, then bulk-load the documents into it.
recreate_index(name=raw_index_name, settings=settings)
run_generator(index_name=raw_index_name)

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [87]:
def search(index, query, *args, size=20):
    """Run ``query`` against ``index`` and print the hits.

    Args:
        index: name of the index to search.
        query: request body passed to ``es.search``.
        *args: names of ``_source`` fields to print for every hit.
        size: maximum number of hits to request. Keyword-only; defaults to 20,
            which is larger than Elasticsearch's own default of 10.

    Returns:
        None. The result is printed by ``pretty_print_result``.
    """
    result = es.search(index=index, body=query, size=size)
    pretty_print_result(result, args)
                        
def pretty_print_result(search_result, fields=()):
    """Print the total hit count, then the id, score and selected fields of each hit.

    Args:
        search_result: search response dict; ``hits.total.value`` and the
            ``hits.hits`` list are read from it.
        fields: iterable of ``_source`` field names to print for every hit.
            Defaults to an empty tuple (print no fields); an immutable default
            is used so the default cannot be shared and mutated between calls.

    Raises:
        KeyError: if a requested field is absent from a hit's ``_source``.
    """
    hits_section = search_result['hits']
    print(f'Total documents: {hits_section["total"]["value"]}')
    for hit in hits_section['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(index, doc_id):
    """Fetch document ``doc_id`` from ``index`` and return its ``_source`` part."""
    response = es.get(index=index, id=doc_id)
    return response['_source']

In [126]:
# Match every document and print the `url` field of each returned hit.
query = {'query': {'match_all': {}}}

search(raw_index_name, query, 'url')

Total documents: 10000
Doc 29001, score is 1.0
url: http://www.atom.by/index.php?c=3&i=1622&ic=1282
Doc 29002, score is 1.0
url: http://www.atom.by/index.php?c=8&i=2232&ic=789
Doc 29003, score is 1.0
url: http://www.atom.by/index.php?c=8&i=634&ic=2330
Doc 29004, score is 1.0
url: http://www.atom.by/index.php?c=8&i=779&ic=663
Doc 29005, score is 1.0
url: http://www.atom.by/index.php?c=8&i=2232&ic=1736
Doc 29006, score is 1.0
url: http://www.atom.by/index.php?c=3&i=1622&ic=1912
Doc 29007, score is 1.0
url: http://www.atom.by/index.php?c=16&i=126&ic=452
Doc 29008, score is 1.0
url: http://www.atom.by/index.php?c=9&i=915&ic=430
Doc 29009, score is 1.0
url: http://www.atom.by/index.php?c=8&i=74&ic=570
Doc 29010, score is 1.0
url: http://www.atom.by/index.php?c=3&i=193&ic=1912
Doc 29011, score is 1.0
url: http://www.atom.by/index.php?c=8&i=660&ic=3714
Doc 29012, score is 1.0
url: http://www.atom.by/index.php?c=3&i=2179&ic=342
Doc 29013, score is 1.0
url: http://www.atom.by/index.php?c=3&i=21