In [1]:
from nltk.corpus import stopwords

In [2]:
import nltk
from nltk.tokenize import word_tokenize

In [47]:
from pymystem3 import Mystem
mystem = Mystem()

In [4]:
from string import punctuation

In [48]:
def preprocessing(input_text, del_stopwords=True, del_digit=True):
    """
    :input: raw text
    1. lowercase, del punctuation, tokenize
    2. normal form
    3. del stopwords
    4. del digits
    :return: lemmas
    """
    russian_stopwords = set(stopwords.words('russian'))
    words = [x.lower().strip(punctuation+'»«–…') for x in word_tokenize(input_text)]
    lemmas = [mystem.lemmatize(x)[0] for x in words if x]

    lemmas_arr = []
    for lemma in lemmas:
        if del_stopwords:
            if lemma in russian_stopwords:
                continue
        if del_digit:
            if lemma.isdigit():
                continue
        lemmas_arr.append(lemma)
    return lemmas_arr

In [6]:
def file_split(file):
    with open(file, 'r', encoding = 'utf-8') as f:
        text = f.read().split('\n')
        info = {}
        info['name'] = text[0]
        info['price'] = text[1]
        info['date'] = text[2]
        info['view'] = text[3]
        info['author'] = text[4]
        info['adress'] = text[5]
        info['text'] = ' '.join(text[6:])
        return info

In [7]:
from collections import Counter
import numpy as np
import math
from math import log
import os

In [8]:
def get_information(path, kind):
    """
    Create inverted index by input doc collection
    Get essential information from input doc collection
    :return: inverted index, 
    """
    dictionary = {}
    files_len = {}
    for file in os.listdir(path):
        try:
            info = file_split(path+'/'+file)
            if kind == 'date' or kind == 'price':
                words = preprocessing(info[kind], del_digit=False)
            else:
                words = preprocessing(info[kind])
                files_len[file] = len(words)
                counter = Counter(words)
                for word in counter:
                    if word in dictionary:
                        dictionary[word][file] = counter[word]
                    else:
                        dictionary[word] = {}
                        dictionary[word][file] = counter[word]
        except:
            pass
    return files_len, dictionary

In [9]:
kinds = ['name', 'author', 'date', 'adress', 'price', 'text', 'view']

In [109]:
index = {}
files_len = {}
for kind in kinds:
    files_len[kind], index[kind] = get_information('avito', kind)

In [11]:
import json

In [12]:
with open('index.txt', 'r', encoding='utf-8') as f:
    index = json.loads(f.read())

In [13]:
with open('files_len.txt', 'r', encoding='utf-8') as f:
    files_len = json.loads(f.read())

In [54]:
import pandas as pd
df = pd.DataFrame(index)

In [14]:
k1 = 2.0
b = 0.75

def score_BM25(qf, dl, avgdl, k1, b, N, n) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    score = math.log((N-n+0.5)/(n+0.5)) * (k1+1)*qf/(qf+k1*(1-b+b*(dl/avgdl)))
    return score

In [15]:
def compute_sim(word, index, files_len):
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    N = len(files_len)
    avgdl = sum(files_len.values())/N
    if word in index:
        n = len(index[word])
        result = {}
        for file in index[word]:
            qf = index[word][file]
            score = score_BM25(qf, files_len[file], avgdl, k1, b, N, n)
            result[file] = score
        return result
    else:
        return {}


def get_search_result(inquiry, kind):
    """
    Compute sim score between search query and all documents in collection
    :return: list of files
    """
    global index, files_len
    score = defaultdict(int)
    if kind == 'date' or kind == 'price':
        words = preprocessing(inquiry, del_digit=False)
    else:
        words = preprocessing(inquiry)
    for word in words:
        result = compute_sim(word, index[kind], files_len[kind])
        for file in result:
            score[file] += result[file]  
        
    return sorted(score, key=score.get, reverse = True)[:10]

In [16]:
from collections import defaultdict


In [17]:
from flask import Flask
from flask import render_template, request
app = Flask(__name__)

In [18]:
from gensim.models import Word2Vec, KeyedVectors

In [19]:
w2v_model = Word2Vec.load('/users/kata/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model')

In [20]:
def get_w2v_vectors(text, model): 
    """
    Получает вектор документа
    """
    n = 0
    vector = [0] * 300
    lemmas = preprocessing(text)
    for lemma in lemmas:
        try:
            vector += model.wv[lemma]
            n += 1
        except:
            None
    if n != 0:
        vector = vector / n
    return vector

def save_w2v_base(path, model):
    """Индексирует всю базу для поиска через word2vec"""
    w2v_base = []
    for file in os.listdir(path=path)[1:]:
        info = file_split(path+'/'+file)
        vec = {}
        vec['id'] = file
        vec['text'] = info['text']
        vec['vector'] = get_w2v_vectors(info['text'], model)
        w2v_base.append(vec)
            

    return w2v_base


In [21]:
w2v_base = save_w2v_base('avito', w2v_model)

In [22]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


In [23]:
def get_tagged_data(path):
    tagged_data = []
    i = 0
    for file in os.listdir(path=path)[1:]:
        try:
            info = file_split(path+'/'+file)
            data = preprocessing(info['text'], del_stopwords=False)
            tagged_data.append(TaggedDocument(words=data, tags=[i]))
            i+=1
        except:
            None
    return tagged_data

In [24]:
def train_doc2vec(tagged_data):
    model = Doc2Vec(vector_size=100, min_count=5, alpha=0.025, min_alpha=0.025, epochs=100, workers=2, dm=1)
    model.build_vocab(tagged_data)
    model.random.seed(12345)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    return model

In [25]:
def get_d2v_vectors(text, model):
    """Получает вектор документа"""
    return model.infer_vector(text)

def save_d2v_base(path, model):
    """Индексирует всю базу для поиска через doc2vec"""
    d2v_base = []
    for file in os.listdir(path=path)[1:]:
        info = file_split(path+'/'+file)
        vec = {}
        vec['id'] = file
        vec['text'] = info['text']
        vec['vector'] = get_d2v_vectors(info['text'], model)
        d2v_base.append(vec)
    return d2v_base

In [26]:
d2w_model = train_doc2vec(get_tagged_data('avito'))

In [27]:
d2v_base = save_d2v_base('avito', d2w_model)

In [28]:
from gensim import matutils
import numpy as np 

def similarity(v1, v2):
    v1_norm = matutils.unitvec(np.array(v1))
    v2_norm = matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

In [42]:
def search_w2v(inquiry):
    global w2v_base, w2v_model
    v1 = get_w2v_vectors(inquiry, w2v_model)
    score = defaultdict(int)
    for vec in w2v_base:
        v2 = vec['vector']
        sim = similarity(v1, v2)
        score[vec['text']] = sim
    return sorted(score, key=score.get, reverse = True)[:10]
        

def search_d2v(inquiry):
    global d2v_base, d2w_model
    v1 = get_d2v_vectors(inquiry, d2w_model)
    score = defaultdict(int)
    for vec in d2v_base:
        v2 = vec['vector']
        sim = similarity(v1, v2)
        score[vec['text']] = sim
    return sorted(score, key=score.get, reverse = True)[:10]

In [30]:
search_d2v('книга')

  if np.issubdtype(vec.dtype, np.int):


['letopisi_1148732967.txt',
 'vvedenie_v_professiyu_psiholog_975252123.txt',
 'skazka_kolobok_kniga-panorama_1413374738.txt',
 'herluf_bidstrup_1505746233.txt',
 'duel_s_odnim_pistoletom_smirenskiy_mihail_1175100792.txt',
 'fables_9sons_of_empire_1705480580.txt',
 'shkola_risovaniya_1635959296.txt',
 'knigizhzl_1711397912.txt',
 'karmannyy_atlas_sssr_1083085053.txt',
 'kniga_1688439272.txt']

In [31]:
search_w2v('книга')

  if np.issubdtype(vec.dtype, np.int):


['voronin_neznanskiy_shitov_topol_890410197.txt',
 'aleksandr_zorich_6_knig_985215302.txt',
 'aferisty_a_malyugin_1161825982.txt',
 'detskie_knigi_1718632671.txt',
 'evropeyskoe_iskusstvo_xlx_veka_1140898144.txt',
 'georgiy_sytin_1422733275.txt',
 'kniga_bud_yasnym_den_a_korneev_1322027375.txt',
 'kniga_chernaya_zhemchuzhina_viktor_vazhdaev_1610234720.txt',
 'kniga_timka_boris_raevskiy_1657827033.txt',
 'knigi_1425913755.txt']

In [None]:
@app.route('/poisk2')
def poisk4():
    if request.args:
        inquiry = request.args['word']
        kind = request.args['kind']
        search_method = request.args['search_method']
        if search_method == 'inverted_index':
            b = get_search_result(inquiry, kind)
        elif search_method == 'word2vec':
            b = search_w2v(inquiry)
        else:
            b = search_d2v(inquiry)
        if b == []:
            b.append('Извините, по вашему запросу ничего не найдено. Попробуйте ещё!')
        return render_template('result1.html', b=b)
    return render_template('search1.html')

if __name__ == "__main__":
    app.run()

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [23/Oct/2018 15:20:44] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2018 15:20:53] "GET /poisk2 HTTP/1.1" 200 -
  if np.issubdtype(vec.dtype, np.int):
127.0.0.1 - - [23/Oct/2018 15:21:04] "GET /poisk2?word=%D0%BB%D0%B5%D1%80%D0%BC%D0%BE%D0%BD%D1%82%D0%BE%D0%B2&kind=text&search_method=word2vec HTTP/1.1" 200 -
