In [1]:
import numpy as np
import pandas as pd
import utils
from collections import Counter
import pymorphy2
import sys

In [2]:
# Memo table used by norm_word: raw word -> normal form that was computed for it.
cache = {}
# Single pymorphy2 analyzer instance shared by every norm_word call.
morph = pymorphy2.MorphAnalyzer()

In [6]:
def norm_word(word):
    """Return the normal form of ``word``, memoizing the answer in ``cache``.

    The normal form is taken from the first parse returned by
    ``morph.parse``. If the analyzer returns no parses, the word itself is
    used. Either way the result is stored in the module-level ``cache`` so
    each distinct word is analyzed only once.
    """
    try:
        return cache[word]
    except KeyError:
        pass  # not seen yet: analyze it below

    analyses = morph.parse(word)
    lemma = analyses[0].normal_form if len(analyses) > 0 else word
    cache[word] = lemma
    return lemma

In [3]:
# Load the queries; the counting cell below iterates over this object with
# .items(), unpacking each entry as (query_id, query text).
queries = utils.load_queries("data/queries_ec.tsv")

In [11]:
# Frequency tables built from the raw (as typed) query words.
unigrams_cnt = Counter()
bigrams_cnt = Counter()
char_trigrams_cnt = Counter()

# The same three tables built from the normalized words (see norm_word).
unigrams_norm_cnt = Counter()
bigrams_norm_cnt = Counter()
char_trigrams_norm_cnt = Counter()

In [14]:
def get_unigrams(words, norm=False):
    """Return the unigrams for ``words`` as a new list.

    Args:
        words: iterable of word strings.
        norm: when truthy, every word is passed through ``norm_word``;
            otherwise the words are copied unchanged.

    Returns:
        A list with one entry per input word, in the same order.
    """
    if norm:
        return [norm_word(token) for token in words]
    return list(words)

In [21]:
def get_bigrams(unigrams):
    """Return the word bigrams of ``unigrams``.

    Each bigram is two adjacent unigrams joined by a single space. A
    sequence with fewer than two items yields an empty list.
    """
    return [left + " " + right for left, right in zip(unigrams, unigrams[1:])]

In [22]:
def get_char_trigrams(unigrams):
    """Return every 3-character substring of every unigram.

    Trigrams are listed unigram by unigram, left to right within each one.
    Unigrams shorter than three characters contribute nothing.
    """
    return [
        token[start:start + 3]
        for token in unigrams
        for start in range(len(token) - 2)
    ]

In [23]:
# Start from empty tables so that re-running this cell recomputes the counts
# instead of adding the same queries to the existing counters a second time.
unigrams_cnt = Counter()
bigrams_cnt = Counter()
char_trigrams_cnt = Counter()
unigrams_norm_cnt = Counter()
bigrams_norm_cnt = Counter()
char_trigrams_norm_cnt = Counter()

for query_id, query in queries.items():
    # NOTE(review): split(" ") yields empty tokens when a query contains
    # consecutive spaces, and those are counted as well - confirm intended.
    words = query.split(" ")

    # Raw and normalized views of the same query.
    unigrams = get_unigrams(words)
    unigrams_norm = get_unigrams(words, True)
    bigrams = get_bigrams(unigrams)
    bigrams_norm = get_bigrams(unigrams_norm)
    char_trigrams = get_char_trigrams(unigrams)
    char_trigrams_norm = get_char_trigrams(unigrams_norm)

    # Accumulate occurrence counts over all queries.
    unigrams_cnt.update(unigrams)
    bigrams_cnt.update(bigrams)
    char_trigrams_cnt.update(char_trigrams)
    unigrams_norm_cnt.update(unigrams_norm)
    bigrams_norm_cnt.update(bigrams_norm)
    char_trigrams_norm_cnt.update(char_trigrams_norm)

In [33]:
# Raw unigrams whose occurrence count is below 2, kept together with that count.
min_words = {
    word: count
    for word, count in unigrams_cnt.items()
    if count < 2
}

In [39]:
# Number of distinct raw (non-normalized) bigrams held in bigrams_cnt.
len(bigrams_cnt)

18362

In [40]:
# Persist every frequency table through utils.save_to_file, one file per table.
for _counter, _pkl_path in (
    (unigrams_cnt, "pkls/unigrams_cnt.pkl"),
    (bigrams_cnt, "pkls/bigrams_cnt.pkl"),
    (char_trigrams_cnt, "pkls/char_trigrams_cnt.pkl"),
    (unigrams_norm_cnt, "pkls/unigrams_norm_cnt.pkl"),
    (bigrams_norm_cnt, "pkls/bigrams_norm_cnt.pkl"),
    (char_trigrams_norm_cnt, "pkls/char_trigrams_norm_cnt.pkl"),
):
    utils.save_to_file(_counter, _pkl_path)

### Normalize queries (replace every word with its normal form and write a new TSV)

In [42]:
# Rewrite the query file with every word replaced by its normal form.
# Each input row is read as "<id>\t<query>"; the output keeps the id and the
# word order, separated by a tab.
# NOTE(review): both files are opened with the platform default encoding -
# confirm that it matches the encoding of the data files.
with open("data/queries_ec.tsv", 'rt') as fd, open("data/queries_norm.tsv", "wt") as outfd:
    for line in fd:
        stripped = line.strip()
        if not stripped:
            # Blank line: there is no id or query to write, so skip it
            # rather than fail on the missing columns.
            continue
        parts = stripped.split('\t')
        if len(parts) > 1:
            words = [norm_word(x) for x in parts[1].split(' ')]
        else:
            # Row with an id but no query column: keep the row with an empty
            # query instead of raising IndexError on parts[1].
            words = []
        print("{}\t{}".format(parts[0], " ".join(words)), file=outfd)