In [8]:
from flask import Flask, render_template, request, redirect, url_for
import json
import pickle
import pymorphy2
from judicial_splitter import splitter as sp
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import os
from math import log

app = Flask(__name__)

morph = pymorphy2.MorphAnalyzer()

with open('inv_base.pkl', 'rb') as f:
    inv_base = pickle.load(f)

def preprocessing(input_text, del_stopwords=True, del_digit=True):
    """
    :input: raw text
        1. lowercase, del punctuation, tokenize
        2. normal form
        3. del stopwords
        4. del digits
    :return: lemmas
    """
    russian_stopwords = set(stopwords.words('russian'))
    words = [x.lower().strip(string.punctuation+'»«–…') for x in word_tokenize(input_text)]
    lemmas = [morph.parse(x)[0].normal_form for x in words if x]

    lemmas_arr = []
    for lemma in lemmas:
        if del_stopwords:
            if lemma in russian_stopwords:
                continue
        if del_digit:
            if lemma.isdigit():
                continue
        lemmas_arr.append(lemma)
    return lemmas_arr

class IterDocs(object):
    def __init__(self, text=False, lemmas=False, tagged=False):
        self.text = text
        self.lemmas = lemmas
        self.tagged = tagged
        
    def __iter__(self):
        for root, dirs, files in os.walk('./avito_parsed'):
            for i, file in enumerate(files):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    if self.tagged is True:
                        yield TaggedDocument(words=json.load(f), tags=[i])
                    elif self.text is True: 
                        yield ' '.join(json.load(f))
                    elif self.lemmas is True:
                        yield json.load(f)

def score_BM25(qf, dl, avgdl, k1, b, N, n) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    idf = log((N - n + 0.5) / (n + 0.5))
    score = (idf * (k1 + 1) * qf) / (qf + k1 * (1 - b + b * dl / avgdl))
        
    return score

def compute_sim(query, doc, inv_index, k1, b, avgdl, N) -> float:
    """
    Compute parameters for BM25 score and pass them to the calculation function
    :param query: str: word for which to claculate BM25
    :param doc: str: doc for which to claculate BM25
    :param inv_index: default_dict: inverted index for the collection, that includes doc
    :return: score
    """
    qf = doc.count(query)
    dl = len(doc)
    
    if query in inv_index:
        n = len(inv_index[query])
    else:
        n = 0
     
    return score_BM25(qf, dl, avgdl, k1, b, N, n)

def search_inv(query, corpus, inv_index) -> list:
    """
    Search documents relative to query using inverted index algorithm.
    :param query: str: input text
    :param questions: list: all questions from corpus
    :param answers: list: all answers from corpus
    :param inv_index: list: questions inverted index
    :return: list: 5 relevant answers
    """
    def mean(numbers):
        return float(sum(numbers)) / max(len(numbers), 1)

    k1 = 2.0
    b = 0.75
    file_lens = [len(file) for file in IterDocs(lemmas=True)]
    avgdl = mean(file_lens)
    N = len(file_lens)

    
    query_list = preprocessing(query)
    scores = list()
    
    for i, doc in enumerate(corpus):
        score = 0
        for word in query_list:
            score += compute_sim(word, doc, inv_index, k1, b, avgdl, N)
        scores.append([i, score])
        
    ranked = sorted(scores, key = lambda x: x[1], reverse=True)
    
    result = list()
    names = list()
    i = 0
    while len(result) < 5:
        doc = ranked[i]
        name = os.listdir('./avito_parsed')[doc[0]][:-7]
        if name[-1] is '_':
            name = name[:-1]
        name += '.txt'

        if not name in names:
            names.append(name)            
            with open('./avito_texts/%s' %(name), 'r', encoding='utf-8') as f:
                result.append(f.read())
        i += 1

    return result

def search(query, search_method):
    if search_method == 'inverted_index':
        search_result = search_inv(query, IterDocs(lemmas=True), inv_base)
    elif search_method == 'word2vec':
        search_result = search_w2v(query, w2v_base)
    elif search_method == 'doc2vec':
        search_result = search_d2v(query, d2v_base)
    else:
        raise TypeError('unsupported search method')
    return search_result

@app.route('/')
def search_fucntion():
    if request.args:
        query = rquest.args['query']
        return render_template('results.html')
    else:
        return render_template('index.html')
    
if __name__ == '__main__':
    app.run(debug = True)


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
morph = pymorphy2.MorphAnalyzer()

with open('inv_base.pkl', 'rb') as f:
    inv_base = pickle.load(f)

def preprocessing(input_text, del_stopwords=True, del_digit=True):
    """
    :input: raw text
        1. lowercase, del punctuation, tokenize
        2. normal form
        3. del stopwords
        4. del digits
    :return: lemmas
    """
    russian_stopwords = set(stopwords.words('russian'))
    words = [x.lower().strip(string.punctuation+'»«–…') for x in word_tokenize(input_text)]
    lemmas = [morph.parse(x)[0].normal_form for x in words if x]

    lemmas_arr = []
    for lemma in lemmas:
        if del_stopwords:
            if lemma in russian_stopwords:
                continue
        if del_digit:
            if lemma.isdigit():
                continue
        lemmas_arr.append(lemma)
    return lemmas_arr

class IterDocs(object):
    def __init__(self, text=False, lemmas=False, tagged=False):
        self.text = text
        self.lemmas = lemmas
        self.tagged = tagged
        
    def __iter__(self):
        for root, dirs, files in os.walk('./avito_parsed'):
            for i, file in enumerate(files):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    if self.tagged is True:
                        yield TaggedDocument(words=json.load(f), tags=[i])
                    elif self.text is True: 
                        yield ' '.join(json.load(f))
                    elif self.lemmas is True:
                        yield json.load(f)

def score_BM25(qf, dl, avgdl, k1, b, N, n) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    idf = log((N - n + 0.5) / (n + 0.5))
    score = (idf * (k1 + 1) * qf) / (qf + k1 * (1 - b + b * dl / avgdl))
        
    return score

def compute_sim(query, doc, inv_index, k1, b, avgdl, N) -> float:
    """
    Compute parameters for BM25 score and pass them to the calculation function
    :param query: str: word for which to claculate BM25
    :param doc: str: doc for which to claculate BM25
    :param inv_index: default_dict: inverted index for the collection, that includes doc
    :return: score
    """
    qf = doc.count(query)
    dl = len(doc)
    
    if query in inv_index:
        n = len(inv_index[query])
    else:
        n = 0
     
    return score_BM25(qf, dl, avgdl, k1, b, N, n)

def search_inv(query, corpus, inv_index) -> list:
    """
    Search documents relative to query using inverted index algorithm.
    :param query: str: input text
    :param questions: list: all questions from corpus
    :param answers: list: all answers from corpus
    :param inv_index: list: questions inverted index
    :return: list: 5 relevant answers
    """
    def mean(numbers):
        return float(sum(numbers)) / max(len(numbers), 1)

    k1 = 2.0
    b = 0.75
    file_lens = [len(file) for file in IterDocs(lemmas=True)]
    avgdl = mean(file_lens)
    N = len(file_lens)

    
    query_list = preprocessing(query)
    scores = list()
    
    for i, doc in enumerate(corpus):
        score = 0
        for word in query_list:
            score += compute_sim(word, doc, inv_index, k1, b, avgdl, N)
        scores.append([i, score])
        
    ranked = sorted(scores, key = lambda x: x[1], reverse=True)
    
    result = list()
    names = list()
    i = 0
    while len(result) < 5:
        doc = ranked[i]
        name = os.listdir('./avito_parsed')[doc[0]][:-7]
        if name[-1] is '_':
            name = name[:-1]
        name += '.txt'

        if not name in names:
            names.append(name)            
            with open('./avito_texts/%s' %(name), 'r', encoding='utf-8') as f:
                result.append(f.read())
        i += 1

    return result

def search(query, search_method):
    if search_method == 'inverted_index':
        search_result = search_inv(query, IterDocs(lemmas=True), inv_base)
    elif search_method == 'word2vec':
        search_result = search_w2v(query, w2v_base)
    elif search_method == 'doc2vec':
        search_result = search_d2v(query, d2v_base)
    else:
        raise TypeError('unsupported search method')
    return search_result

SystemExit: 1