In [None]:
import asyncio
from aioes import Elasticsearch
from elasticsearch import helpers

In [2]:
from chunk import TimeDistance
from chunk import Chunker
from slack_data_loader import SlackLoader

In [3]:
import json

In [4]:
# Directory handed to SlackLoader as the location of the Slack data.
PATH_TO_DATA = './data'

In [15]:
# Elasticsearch client shared by every coroutine in this notebook; talks to a node on localhost:9200.
es = Elasticsearch(['localhost:9200'])

In [None]:
def gen_synonyms(path="help_data/synonyms.txt"):
    """
    Generate some synonyms in a file. All words separated by comma are treated as equal.

    The file is written as UTF-8 explicitly: the synonym groups contain Cyrillic
    words, and relying on the platform default encoding can fail or produce a
    file in a different encoding.

    :param path: file to (over)write; defaults to the location used by this notebook.
    """
    synonym_groups = (
        "xboost, эксгебуст, эксбуст, иксгебуст, xgboost",
        "пыха, пыху, пых, php",
        "lol, лол",
        "питон, python",
    )
    with open(path, "w", encoding="utf-8") as syns:
        # One synonym group per line.
        for group in synonym_groups:
            syns.write(group + "\n")

In [20]:
# Name of the index and of the two mapping types declared in it.
index_name = "ods-slack-index"
mapping_name = "thread"
message_mapping = "message"

# Settings and mappings sent to Elasticsearch when the index is created.
index_body = {
    "settings": {
        "analysis": {
            "filter": {
                "russian_stop": {
                    "type": "stop",
                    "stopwords": "_russian_"
                },
                "russian_stemmer": {
                    "type": "stemmer",
                    "language": "russian"
                },
                "synonyms_expand": {
                    "type": "synonym",
                    # path to synonym file.
                    # for ES to be able to read it, security policy should be set as described here:
                    # https://stackoverflow.com/questions/35401917/reading-a-file-in-an-elasticsearch-plugin
                    "synonyms_path": "/usr/share/config_data/synonyms.txt"
                }
            },
            "analyzer": {
                "russian_syn": {
                    "tokenizer": "standard",
                    # NOTE(review): the synonym filter runs after the stemmer, so it sees
                    # already-stemmed tokens — confirm this ordering is intended.
                    "filter": [
                        "lowercase",
                        "russian_stop",
                        "russian_stemmer",
                        "synonyms_expand"
                    ]
                }
            }
        }
    },
    "mappings": {
        # Analysed fields use "text": the mapping already relies on "keyword", and the
        # legacy "string" type is deprecated in the Elasticsearch versions that have it.
        mapping_name: {
            "properties": {
                "channel": {"type": "keyword"},
                "title": {"type": "text", "analyzer": "russian_syn"},
                "ts": {"type": "date"},
                "messages": {
                    "properties": {
                        "text": {"type": "text", "analyzer": "russian_syn"},
                        "user_id": {"type": "keyword"},
                        "user_real_name": {"type": "text"},
                        # NOTE(review): the thread-level `ts` is multiplied by 1000 before
                        # indexing while per-message `ts` values are not — confirm both
                        # are interpreted as intended by these `date` fields.
                        "ts": {"type": "date"}
                    }
                }
            }
        },
        message_mapping: {
            "properties": {
                "text": {"type": "text", "analyzer": "russian_syn"},
                "user_id": {"type": "keyword"},
                "user_real_name": {"type": "text"},
                "ts": {"type": "date"}
            }
        }
    }
}

async def create_index():
    """
    Create the index `index_name` using the settings and mappings in `index_body`.

    :return: the response of the index-creation call.
    """
    response = await es.indices.create(index=index_name, body=index_body)
    return response
    
async def check_index_exists():
    """Ask Elasticsearch whether the index `index_name` exists and return its answer."""
    answer = await es.indices.exists(index=index_name)
    return answer


async def delete_index():
    """
    Delete the whole index `index_name`.

    Index removal lives on the indices API (`es.indices.delete`), next to the
    `create`/`exists`/`open`/`close` calls used by the other helpers. The
    client's top-level `delete` removes a single document and needs a document
    type and id, so it cannot be used to drop an index.

    :return: the response of the index-deletion call.
    """
    return await es.indices.delete(index=index_name)

async def openclose():
    """
    Close the index and open it again; doing so reloads the synonyms file.
    """
    indices_api = es.indices
    await indices_api.close(index=index_name)
    await indices_api.open(index=index_name)
    
async def populate_index(channel, messages):
    """
    Index one group of messages as a single document of type `mapping_name`.

    The first message supplies the document's title (its text) and timestamp
    (its `ts` multiplied by 1000); the full message list is stored under
    "messages".

    :param channel: channel name stored with the document.
    :param messages: non-empty sequence of message dicts with 'text' and 'ts'.
    """
    opener = messages[0]
    thread_doc = {
        "channel": channel,
        "title": opener['text'],
        "ts": opener['ts'] * 1000,
        "messages": messages
    }
    await es.index(index=index_name, doc_type=mapping_name, body=thread_doc)
    # TODO: additionally index every message on its own under `message_mapping`
    # (one es.index call per message, ideally as a bulk upload).

async def query_index(query):
    """
    Run a `multi_match` search for `query` over documents of type `mapping_name`.

    The query is matched against "title" (boosted by 3) and "messages.text".

    :param query: text to search for.
    :return: the search response.
    """
    multi_match = {
        "fields": ["title^3", "messages.text"],
        "query": query
    }
    search_body = {"query": {"multi_match": multi_match}}
    return await es.search(index=index_name, doc_type=mapping_name, body=search_body)

async def search_user(username):
    """
    Run a `multi_match` search for `username` on "messages.user_real_name"
    over documents of type `mapping_name`.

    :param username: name to look for.
    :return: the search response.
    """
    multi_match = {
        "fields": ["messages.user_real_name"],
        "query": username
    }
    search_body = {"query": {"multi_match": multi_match}}
    return await es.search(index=index_name, doc_type=mapping_name, body=search_body)

In [7]:
# Event loop used by the cells below to run the notebook's coroutines to completion.
loop = asyncio.get_event_loop()

In [16]:
# Start from a clean slate: drop the index if it is already there, then create it anew.
if loop.run_until_complete(check_index_exists()):
    print(loop.run_until_complete(delete_index()))
    
print(loop.run_until_complete(create_index()))

{'acknowledged': True}
{'acknowledged': True, 'shards_acknowledged': True}


In [None]:
# Reload synonyms without recreating the whole database:
# rewrite the synonyms file, then close and reopen the index.
gen_synonyms()
loop.run_until_complete(openclose())

In [9]:
def _prepare_message(msg, users):
    """
    Normalise one message dict in place before it is indexed.

    :param msg: message dict; must contain 'ts'.
    :param users: mapping from user id to a user record with a 'name' entry.
    """
    # Messages without a 'user' key simply get no display name instead of raising KeyError.
    author_id = msg.get('user')
    if author_id in users:
        msg['user_real_name'] = users[author_id]['name']
    # presumably 'dt' is a loader-side helper value that should not be stored — TODO confirm
    msg.pop('dt', None)
    # Keep the original timestamp as text, then store whole seconds under 'ts'.
    msg['timestamp'] = str(msg['ts'])
    # Going through float() also accepts textual timestamps with a fractional part
    # (e.g. "1487000000.000123"), which a bare int() rejects.
    msg['ts'] = int(float(msg['ts']))
    for attach in msg.get("attachments", ()):
        if 'ts' in attach:
            attach['ts'] = float(attach['ts'])


async def index_channel(channel = "nlp"):
    """
    Load one channel, split it into message groups and index every group.

    Each message is normalised in place first (see `_prepare_message`); all
    groups are then submitted to `populate_index` concurrently.

    :param channel: name of the channel to load and index.
    :return: list with the result of `populate_index` for each group, in group order.
    """
    data = SlackLoader(PATH_TO_DATA, only_channels=[channel])
    chunker = Chunker()
    groups  = chunker.get_groups(data)

    print("Indexing: " + channel)

    # The user table is the same for every group, so fetch it once.
    users = data.users
    workers = []
    for group in groups:
        for msg in group:
            _prepare_message(msg, users)
        workers.append(
            asyncio.ensure_future(populate_index(channel, group))
        )
    return await asyncio.gather(*workers)

async def index_channels(channels):
    """
    Index all the given channels concurrently and wait until every one is done.

    :param channels: iterable of channel names, each passed to `index_channel`.
    """
    pending = []
    for channel in channels:
        pending.append(asyncio.ensure_future(index_channel(channel)))
    await asyncio.gather(*pending)
        

In [17]:
# Channels to load and index; each name is passed to SlackLoader via index_channel.
# NOTE(review): "reinforcement_learnin" looks truncated — confirm it matches the
# channel name in the data before "fixing" it.
useful_channels = ["nlp", "deep_learning", "datasets",
                  "sequences_series", "bayesian", "_meetings", "edu_academy",
                  "edu_books", "visualization", "hardware",
                  "reinforcement_learnin", "theory_and_practice"]

# Index all listed channels concurrently and block until indexing has finished.
loop.run_until_complete(index_channels(useful_channels))

Indexing: nlp
Indexing: deep_learning
Indexing: datasets
Indexing: sequences_series
Indexing: bayesian
Indexing: _meetings
Indexing: edu_academy
Indexing: edu_books
Indexing: visualization
Indexing: hardware
Indexing: reinforcement_learnin
Indexing: theory_and_practice


In [None]:
# Example search over thread titles and message texts.
loop.run_until_complete(query_index("как использовать xgboost в python"))

In [57]:
# Print every returned thread that matched the user-name search, message by message.
res = loop.run_until_complete(search_user("generall"))['hits']['hits']
for hit in res:
    print("\n------------({})--------------".format(hit['_source']['channel']))
    for msg in hit['_source']['messages']:
        # index_channel stores `user_real_name` only for authors found in the user
        # table, so fall back to the raw user id instead of raising KeyError.
        author = msg.get('user_real_name', msg.get('user', 'unknown'))
        print("{}: {}".format(author, msg['text']))


------------(nlp)--------------
elwis: <https://gist.github.com/Kein1945/9111512>
elwis: если надо просто отрезать окончания, то вот это подойдет
stajilov: проверял недавно, не умеет нормально, лемматизацию может делать типо но нужен русский wordnet нормальный

------------(nlp)--------------
elwis: Коллега сделала интересную сравнительную таблицу чатботов, может кому-то пригодится: <https://medium.com/@datamonsters/25-chatbot-platforms-a-comparative-table-aeefc932eaff>
alexantonov: elwis: Отличная статья. А совершенно случайно нет на русском языке?
elwis: <@U32506X36> К сожалению нет

------------(nlp)--------------
elwis: Если tf-idf вектора нормализованные, можно вместо косинусной близости считать скалярное произведение

------------(nlp)--------------
rvnikita: Ребята, привет. Что почиттать чтобы быстро разобраться в основнах NLP? <https://www.amazon.co.uk/d/Books/Natural-Language-Processing-Python-Steven/0596516495> хорошая книжка или есть что-то другое более признаное?
octocat: 