Infosearch plan:
    + get corpus from the web
    + build inverted index
    + ranking (Okapi BM25)
    simple Flask app for the search engine

In [56]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

#    ulimit -n 2048
from collections import Counter, defaultdict
import json, os, re, string, operator
from pymystem3 import Mystem
from math import log

def reverse_id(collection):
    """Build an inverted index: token -> set of document positions.

    *collection* is an iterable of documents, each an iterable of tokens.
    The position of a document in *collection* is used as its id.
    Returns a ``defaultdict(set)``, so a token repeated inside one document
    is still recorded only once for that document.
    """
    postings = defaultdict(set)
    # Flatten (doc position, token) pairs in reading order, then register each.
    pairs = (
        (doc_id, token)
        for doc_id, tokens in enumerate(collection)
        for token in tokens
    )
    for doc_id, token in pairs:
        postings[token].add(doc_id)
    return postings

def lemmatize_collection(path):
    """Lemmatize every ``.txt`` file found under *path* (recursively).

    For each file, the sixth line (``content[5]``) is taken as the text to
    index -- presumably the article body following header lines; confirm
    against the corpus format.  The text is stripped of punctuation,
    lower-cased, lemmatized with Mystem and split into words.

    Returns a two-element list ``[all_texts, text_id_list]`` where
    ``all_texts[i]`` is the list of lemmas of document ``i`` and
    ``text_id_list[i]`` is the file name that document came from.

    Raises ``OSError`` if *path* cannot be listed and ``IndexError`` if a
    ``.txt`` file has fewer than six lines.
    """
    # Fail fast on a missing or unreadable directory: os.walk() would
    # silently yield nothing and the problem would surface much later.
    os.listdir(path)

    text_id_list = {}
    all_texts = []
    m = Mystem()  # a single lemmatizer instance is reused for all files

    for root, dirs, files in os.walk(path):
        for file_name in files:
            if not file_name.endswith(".txt"):
                continue

            full_file_path = os.path.join(root, file_name)

            # ``with`` releases the handle even if reading fails, so a bad
            # file cannot leak descriptors while walking a large corpus.
            # The encoding is explicit because the corpus is Cyrillic text.
            with open(full_file_path, 'r', encoding='utf-8') as fh:
                content = fh.readlines()

            text = content[5]
            text = re.sub('\n', ' ', text)
            # Re-insert a separator where a sentence end is glued to the
            # next capitalised word, before punctuation is stripped.
            text = re.sub(r'[\.\?\!…]([А-Я])', r'. \1', text)
            text = re.sub(r'[^\w\s]', '', text)
            text = re.sub('  ', ' ', text)
            text = text.lower()

            lemmas = m.lemmatize(text)
            lemma_text = ''.join(lemmas)
            lemma_text = re.sub('  ', ' ', lemma_text)

            # The document id is its position in all_texts.
            text_id_list[len(all_texts)] = file_name
            all_texts.append(lemma_text.split())

    return [all_texts, text_id_list]

#f1 = open('/Users/Sofia/Desktop/inza-vpered/text_ids.txt', 'w')
#f2 = open('/Users/Sofia/Desktop/inza-vpered/inverse_ids.txt', 'w')

#for k, v in text_ids.items():
#    f1.write("%s - %s" % (str(k), str(v)))
#    f1.write('\n')
    
#for k, v in reverse_ids.items():
#    f2.write("%s - %s" % (str(k), str(v)))
#    f2.write('\n')

#f1.close()
#f2.close()

# ----------------------------- query preparation -----------------------------
def prepare_query(query, stop_words_path='/Users/Sofia/Desktop/inza-vpered/stop_words.txt'):
    """Normalize a raw query into a list of lemmas without stop-words.

    The query is lower-cased, stripped of punctuation and lemmatized with
    Mystem; every lemma that appears in the stop-word file is dropped.

    *stop_words_path* points to a text file with one stop-word per line.
    Its default is the original machine-specific location, so existing
    callers keep working; pass another path to run elsewhere.

    Returns the remaining lemmas in their original order.
    """
    # ``with`` closes the file (the original left it open on every call);
    # a set gives O(1) membership tests and removes duplicates for free.
    with open(stop_words_path, encoding='utf-8') as f3:
        stop_words = {line.rstrip('\n') for line in f3}

    m = Mystem()
    query = query.lower()
    query = re.sub(r'[^\w\s]', '', query)

    q_lemma_text = ''.join(m.lemmatize(query))

    return [word for word in q_lemma_text.split() if word not in stop_words]


# ----------------------------- BM 25 -----------------------------
def compute_avgdl(all_texts): # avgdl
    """Return the average document length (in words), rounded to 0 decimals.

    *all_texts* is a sequence of tokenized documents.  The result is a float
    with no fractional part.  An empty collection raises ZeroDivisionError.
    """
    total_words = sum(len(doc) for doc in all_texts)
    return round(total_words / len(all_texts), 0)

def compute_n(q, reverse_ids):    # n = n(qi) - num of texts that have qi (qi - word in query)
    """Return how many documents contain the word *q*.

    *reverse_ids* maps a word to the collection of ids of documents that
    contain it.  Words missing from the index give 0; the lookup never
    inserts a new key into the index.
    """
    return len(reverse_ids[q]) if q in reverse_ids else 0
    
def compute_fq(q, text): # fq - frequency of word qi in document
    """Count how many items of *text* are equal to *q*.

    *text* is iterated item by item (a list of words in this file), and
    each item is compared to *q* with ``==``.
    """
    return sum(1 for word in text if q == word)

def compute_K(dl, avdl):
    """Return the BM25 length-normalization term for one document.

    *dl* is the document length and *avdl* the average document length.
    Uses the constants k1 = 2.0 and b = 0.75 hard-coded in this function.
    Raises ZeroDivisionError when *avdl* is zero.
    """
    k1 = 2.0
    b = 0.75
    length_ratio = float(dl) / float(avdl)
    return k1 * ((1 - b) + b * length_ratio)

def score_BM25(n, fq, N, dl, avgdl): # computes BM25 for 1 query word and 1 document
    """Return the BM25 contribution of one query word for one document.

    n     -- number of documents containing the word
    fq    -- frequency of the word in this document
    N     -- total number of documents
    dl    -- length of this document
    avgdl -- average document length in the collection

    The IDF term is ``log((N - n + 0.5) / (n + 0.5))``, so it is negative
    for words that occur in more than half of the documents.
    """
    k1 = 2.0
    # Length normalization is evaluated first, as in the original flow.
    norm = compute_K(dl, avgdl)
    idf = log((N - n + 0.5) / (n + 0.5))
    tf_part = ((k1 + 1) * fq) / (norm + fq)
    return idf * tf_part


# ------------- find document matches (has at least 1 word from query) ------------- 
def match_docs(query, reverse_ids): # find document matches (has at least 1 word from query)
    """Return ids of documents containing at least one word of *query*.

    *query* is an iterable of (already prepared) query words and
    *reverse_ids* maps a word to the ids of documents containing it.
    Each id appears once, in first-seen order.  Words absent from the
    index are ignored, and the lookup never adds keys to *reverse_ids*.
    """
    matching_docs = []
    seen = set()  # O(1) duplicate check instead of scanning the result list
    # Iterate the *parameter*; the previous code read an unrelated
    # ``final_query`` name from the enclosing/global scope.
    for word in query:
        if word not in reverse_ids:
            continue
        for doc_id in reverse_ids[word]:
            if doc_id not in seen:
                seen.add(doc_id)
                matching_docs.append(doc_id)

    return matching_docs


# ------------- lemmatized text -> actual text + link ------------- 
def get_actual_text(reverse_text_id, text_ids, texts_dir='/Users/Sofia/Desktop/inza-vpered/texts/'):
    """Return ``(actual_text, link)`` for one document id.

    *text_ids* maps a document id to its file name.  The file is looked up
    in *texts_dir*, whose default is the original machine-specific folder;
    pass another directory to run elsewhere.

    ``actual_text`` is the sixth line of the file (``content[5]``), and
    ``link`` is the article URL built from the file name with its ``.txt``
    suffix removed.

    Raises ``KeyError`` for an unknown id, ``OSError`` if the file cannot be
    opened and ``IndexError`` if it has fewer than six lines.
    """
    get_text = text_ids[reverse_text_id]

    # ``with`` closes the file even when the sixth line is missing.
    with open(os.path.join(texts_dir, get_text), 'r', encoding='utf-8') as f4:
        content = f4.readlines()
    actual_text = content[5]

    # Strip only a literal trailing ".txt".  The former regex '.txt' treated
    # the dot as a wildcard and matched anywhere, mangling names such as
    # "atxt12.txt".
    suffix = '.txt'
    link_part = get_text[:-len(suffix)] if get_text.endswith(suffix) else get_text
    link = 'http://inza-vpered.ru/article/' + link_part

    return (actual_text, link)

# ------------- compute Okapi BM25 -------------
def get_BM25(text, final_query, all_texts, reverse_ids=None, avgdl=None): # for 1 text
    """Return the Okapi BM25 score of one document for a prepared query.

    text        -- the document as a list of lemmas
    final_query -- the prepared query words
    all_texts   -- every document in the collection (lists of lemmas)
    reverse_ids -- optional inverted index (word -> ids of documents that
                   contain it).  When omitted, document frequencies are
                   counted directly from *all_texts* instead of relying on
                   a module-level global.
    avgdl       -- optional precomputed average document length, so that a
                   caller scoring many documents computes it only once.

    The score is the sum of ``score_BM25`` over the query words.
    """
    N = len(all_texts)
    if avgdl is None:
        avgdl = compute_avgdl(all_texts)
    dl = len(text)

    sum_score = 0

    for q in final_query: # for each word in query
        # num of texts that have q
        if reverse_ids is None:
            n = sum(1 for doc in all_texts if q in doc)
        else:
            n = compute_n(q, reverse_ids)
        fq = compute_fq(q, text) # frequency of word q in document
        sum_score += score_BM25(n, fq, N, dl, avgdl) # BM25 for 1 query word and 1 document

    return sum_score

# ------------- final sorting function ------------- 
def BM25_sort(query, path_to_collection, texts_and_ids, reverse_ids):
    """Rank the documents matching *query* by descending BM25 score.

    query              -- raw query string
    path_to_collection -- unused; kept so existing callers keep working
    texts_and_ids      -- ``[all_texts, text_ids]`` as returned by
                          ``lemmatize_collection``
    reverse_ids        -- inverted index built from ``all_texts``

    Returns a list of ``((actual_text, link), score)`` pairs, best first.
    An empty list is returned when no document contains a query word.
    """
    all_texts = texts_and_ids[0]
    text_ids = texts_and_ids[1]

    final_query = prepare_query(query)

    matches = match_docs(final_query, reverse_ids)
    if not matches:
        return []

    # Loop-invariant: the average length does not depend on the document.
    avgdl = compute_avgdl(all_texts)
    docs_and_scores = {}

    for match in matches: # match -- id
        score = get_BM25(all_texts[match], final_query, all_texts,
                         reverse_ids=reverse_ids, avgdl=avgdl)

        actual_text = get_actual_text(match, text_ids)  # (text, link) tuple
        docs_and_scores[actual_text] = score

    sorted_matches = sorted(docs_and_scores.items(), key=operator.itemgetter(1), reverse=True)

    return sorted_matches
    
# Build the lemmatized corpus and its inverted index once, at start-up.
path = '/Users/Sofia/Desktop/inza-vpered/texts' # machine-specific
texts_and_ids = lemmatize_collection(path)
# lemmatize_collection returns [all_texts, text_id_list]; ``all_texts`` was
# previously referenced here without ever being defined at module level.
all_texts = texts_and_ids[0]
reverse_ids = reverse_id(all_texts)

In [82]:
# Example search: show score and link for the ten best-ranked documents.
query = 'каникулы на новый год и рождество'

sorted_matches = BM25_sort(query, path, texts_and_ids, reverse_ids)

for match in sorted_matches[:10]:
    # match is ((actual_text, link), score); emit score, link, then blank lines
    print(str(match[1]), match[0][1], '\n', sep='\n')

#f5 = open('/Users/Sofia/Desktop/results.txt', 'w')
#for text in sorted_matches:
#    f5.write(str(text[1]))
#    f5.write(text[0][1])
#    f5.write(text[0][0])
#    f5.write('\n')
#f5.close()

5.85562869604411
http://inza-vpered.ru/article/59687


5.671064209516752
http://inza-vpered.ru/article/116767


5.490277707525085
http://inza-vpered.ru/article/74388


4.62715638498682
http://inza-vpered.ru/article/47824


4.3675683689987785
http://inza-vpered.ru/article/47061


4.301678404413293
http://inza-vpered.ru/article/1179


4.093279754782544
http://inza-vpered.ru/article/91903


3.6822646319557464
http://inza-vpered.ru/article/50579


3.6617760423097274
http://inza-vpered.ru/article/111269


2.9413238068199026
http://inza-vpered.ru/article/40323


