In [226]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import string
import jamspell
import matplotlib.pyplot as plt
from dataset_prepare import ToStrLower, DropDuplicates, FilterByQuantile
from autocompleter import AutoCompleterBaseline, AutoCompleteBySuffix, AutoCompleteByInvertedIndex
from inverted_index import InvertedIndex
from tries import make_trie

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [227]:
# Read the preprocessed CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="data/s_hist_mil_preprocessed.csv")

## Filter data

In [228]:
# Remove rows that are duplicated on the (wbuser_id, UQ) column pair.
# NOTE(review): which of the duplicated rows survives is decided inside
# DropDuplicates (dataset_prepare module, not visible here) - confirm there.
subset_drop = ["wbuser_id", "UQ"]
drop_duplicates_filter = DropDuplicates(subset_drop)
# `df` is derived from the untouched `data` frame, so re-running this cell
# always starts again from the loaded data.
df = drop_duplicates_filter.apply(data)

DropDuplicates: dataset size 910835 -> 870524


In [229]:
# Quantile bound used to filter rows on the 'cnt' column.
UPPER_QUANTILE = 0.95
# NOTE(review): LOWER_QUANTILE is defined but never passed to FilterByQuantile
# in this cell; the lower bound reported in the output presumably comes from
# the class itself - confirm in dataset_prepare, then wire it in or drop it.
LOWER_QUANTILE = 0
quantile_filter = FilterByQuantile('cnt', UPPER_QUANTILE)
# NOTE(review): `df` is rebuilt from itself, so running this cell a second time
# without re-running the de-duplication cell filters an already filtered frame.
df = quantile_filter.apply(df)

Lower quantile: 0. Upper quantile: 32576.849999999977
FilterByQuantile: dataset size 870524 -> 826997


In [230]:
# Count the rows per distinct query string (UQ) and expose the count as a
# regular 'popularity' column next to 'UQ'.
popularity = (
    df.groupby("UQ")
    .size()
    .rename("popularity")
    .reset_index()
)

In [231]:
# popularity['popularity'] = popularity['popularity'] / popularity['popularity'].max()

In [270]:
# Spot-check: show every query whose text begins with the given prefix.
popularity.loc[popularity["UQ"].str.startswith("кашемир")]

Unnamed: 0,UQ,popularity
197892,кашемир,18
197893,кашемир - шоп,1
197894,кашемир 100,4
197895,кашемир 100%,2
197896,кашемир 80%,1
...,...,...
197974,кашемировый свитер женский,3
197975,кашемировый свитер мужской,1
197976,кашемировый свитер оверсайз,1
197977,кашемировый шарф,6


### Create inverted index

In [279]:
# Pair every distinct query with its popularity count.
# The pairs are materialised into a list on purpose: a bare zip() is a one-shot
# iterator, so after the first consumer has iterated over it, any later cell
# (or a re-run of the indexing cell) would silently receive an empty corpus.
query_popularity_corpus = list(zip(popularity['UQ'], popularity['popularity']))

In [280]:
# Build the inverted index from the (query, popularity) pairs.
# InvertedIndex comes from the local inverted_index module.
inverted_index = InvertedIndex()
# The value returned by process_corpus (`index`) is what the autocompleter is
# constructed with further down.
# NOTE(review): its exact structure is defined in inverted_index.py - confirm there.
index = inverted_index.process_corpus(query_popularity_corpus)

In [281]:
# index['кашемир']

### Create trie (prefix tree)

In [282]:
# Build the trie from the distinct query strings; make_trie comes from the
# local tries module and prints the number of entries it received.
trie = make_trie(popularity['UQ'].values)

Making database for queries... It length: 511138


### Create spellchecker

In [283]:
# Create the JamSpell corrector and load the language model from disk.
corrector = jamspell.TSpellCorrector()
# The call's return value is the cell output (True in the recorded run).
# NOTE(review): the result is not checked - presumably False means the model
# file could not be loaded; confirm and consider asserting on it.
corrector.LoadLangModel('data/jamspell_ru_model_subtitles.bin')

True

### Create autocompleter

In [304]:
# Combine the inverted index, the trie and the spell corrector into a single
# autocompleter; tuning knobs are passed by keyword, one per line.
autocomplete_by_inverted_index = AutoCompleteByInvertedIndex(
    index,
    trie,
    corrector,
    queries_score_thr=1,
    first_prefix=5,
    max_candidates=50,
)

In [306]:
# Demo: request suggestions for a single-word query with max_n=10.
# NOTE(review): in the recorded output the first five suggestions start with
# the typed text and the last five do not, each group ordered by descending
# score - presumably an effect of first_prefix=5; confirm in autocompleter.py.
autocomplete_by_inverted_index.query("компьютер", max_n=10)

Original query score: 13.373493975903614


[Query indx: 216729. Words: компьютерное кресло. Score: 187,
 Query indx: 216742. Words: компьютерные кресла. Score: 15,
 Query indx: 216764. Words: компьютерный стол. Score: 14,
 Query indx: 216777. Words: компьютерный стул. Score: 9,
 Query indx: 216718. Words: компьютерная мышь. Score: 8,
 Query indx: 234966. Words: кресло компьютерное. Score: 396,
 Query indx: 419758. Words: стол компьютерный. Score: 48,
 Query indx: 286293. Words: мышь компьютерная. Score: 37,
 Query indx: 234793. Words: кресла компьютерные. Score: 34,
 Query indx: 421328. Words: стул компьютерный. Score: 31]

In [126]:
# NOTE(review): this cell repeats the corrector setup from the
# "Create spellchecker" section. Its execution count (126) is lower than that
# section's (283), so it was run earlier in the session; on a top-to-bottom
# run it only reloads the same model a second time. Candidate for removal.
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('data/jamspell_ru_model_subtitles.bin')

True

In [267]:
# Spell-checker probe: candidates for the token at position 0 of the list.
# In the recorded output the unchanged (misspelled) input is listed first.
corrector.GetCandidates(['кашимир'], 0)

('кашимир', 'кашемир', 'кашмир')

In [127]:
# Spell-checker probe: candidates for the token at position 0 of the list.
# Here the recorded output lists a corrected spelling ahead of the input.
corrector.GetCandidates(['польто'], 0)

('пальто', 'польто', 'полько', 'мольто')

In [170]:
# Spell-checker probe on an incomplete word (a prefix still being typed).
# NOTE(review): the recorded result is a different complete word rather than a
# completion of the prefix - relevant if FixFragment is applied to partial input.
corrector.FixFragment('польт')

'полет'

In [173]:
# Take the first returned (candidate, score) pair, keep its score, and scale it.
# NOTE(review): 50 is an unexplained constant - name it and document why this
# divisor was chosen, or remove the experiment.
corrector.GetCandidatesWithScores(['польто'], 0)[0][1] / 50

-1.1282475238160268

In [165]:
# Spell-checker probe on a correctly spelled word: the recorded output holds a
# single (candidate, score) pair, and the score is negative.
corrector.GetCandidatesWithScores(['ботинок'], 0)

(('ботинок', -52.02608241405659),)

In [163]:
# Natural logarithm of a positive score magnitude.
# numpy is already imported as `np` in the notebook's first cell, so the
# redundant mid-notebook re-import was removed.
np.log(56.779535819460015)

4.039175975940258

In [None]:
# NOTE(review): this cell was never executed. np.log of a negative real number
# evaluates to nan and emits a RuntimeWarning, so it gives no usable value;
# if the magnitude of the score is what is wanted, take the absolute value
# first - confirm the intent or delete the cell.
np.log(-56.779535819460015)