In [91]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import string
import jamspell
import matplotlib.pyplot as plt
from dataset_prepare import ToStrLower, DropDuplicates, FilterByQuantile
from autocompleter import AutoCompleterBaseline, AutoCompleteBySuffix, AutoCompleteByInvertedIndex
from inverted_index import InvertedIndex
from tries import make_trie

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
# Load the preprocessed CSV export into `data`, the raw frame that the filtering cells below start from.
data = pd.read_csv("data/s_hist_mil_preprocessed.csv")

## Filter data

In [73]:
# Filter out rows that repeat the same (wbuser_id, UQ) pair.
# DropDuplicates is a project helper from dataset_prepare; which of the
# duplicated rows it keeps is not visible here — confirm in its implementation.
subset_drop = ["wbuser_id", "UQ"]
drop_duplicates_filter = DropDuplicates(subset_drop)
# Reads from `data` rather than `df`, so this cell does not feed on its own output.
df = drop_duplicates_filter.apply(data)

DropDuplicates: dataset size 910835 -> 870524


In [74]:
# Quantile bound handed to FilterByQuantile for the 'cnt' column.
UPPER_QUANTILE = 0.95
# NOTE(review): LOWER_QUANTILE is defined but never passed to FilterByQuantile in this cell;
# the "Lower quantile: 0" in the recorded output presumably comes from the helper's own default — confirm.
LOWER_QUANTILE = 0
quantile_filter = FilterByQuantile('cnt', UPPER_QUANTILE)
# NOTE(review): `df` is both the input and the output here, so re-running this cell on its own
# would pass the already-filtered frame through the filter a second time.
df = quantile_filter.apply(df)

Lower quantile: 0. Upper quantile: 32576.849999999977
FilterByQuantile: dataset size 870524 -> 826997


In [66]:
# Popularity of a query = number of rows remaining for that UQ value in `df`.
# The result is a two-column frame: 'UQ' and 'popularity'.
popularity = (
    df
    .groupby("UQ")
    .size()
    .reset_index(name="popularity")
)

In [81]:
# Peek at the distinct query strings as an array (the repr in the output is truncated).
popularity['UQ'].values

array(['!', '! babylight', '! bag', ..., '😹', '🥑', '🦄'], dtype=object)

### Create inverted index

In [77]:
# Pair every query string with its popularity count for the inverted index.
# Materialized as a list because a bare zip() is a one-shot iterator: once it
# has been consumed it is exhausted, so re-running the index-building cell
# without re-running this one would be handed an empty corpus.
query_popularity_corpus = list(zip(popularity['UQ'], popularity['popularity']))

In [78]:
# Feed the (query, popularity) pairs to the project's InvertedIndex. The value
# returned by process_corpus is kept as `index` and later passed to the autocompleter.
inverted_index = InvertedIndex()
index = inverted_index.process_corpus(query_popularity_corpus)

### Create trie

In [82]:
# Build a trie over the distinct query strings with the project's make_trie helper;
# the result is later passed to the autocompleter together with `index`.
trie = make_trie(popularity['UQ'].values)

Making database for queries... It length: 511138


In [89]:
# Show the query column as a Series together with its integer row labels.
popularity['UQ']

0                                !
1                      ! babylight
2                            ! bag
3                          ! flyly
4         ! ковририк в багажник на
                    ...           
511133                           😆
511134                   😊dualfix2
511135                           😹
511136                           🥑
511137                           🦄
Name: UQ, Length: 511138, dtype: object

In [None]:
# Scratch note, never executed: file name of the JamSpell language model that a later cell loads from data/ru_small.bin.
"ru_small.bin"

In [83]:
# Autocompleter constructed from the inverted index and the trie built above.
# NOTE(review): first_prefix=5 and max_candidates=50 are magic numbers whose exact meaning
# is defined inside AutoCompleteByInvertedIndex — consider naming them as constants.
autocomplete_by_inverted_index = AutoCompleteByInvertedIndex(index, trie, first_prefix=5, max_candidates=50)

In [90]:
# Smoke test: request completions for the input "baby" with max_n=10
# (the recorded output lists 10 results).
autocomplete_by_inverted_index.query("baby", max_n=10)

[Query indx: 10633. Words: baby go одежда. Score: 23,
 Query indx: 10698. Words: babygo. Score: 9,
 Query indx: 10625. Words: baby go игрушки. Score: 6,
 Query indx: 10636. Words: baby go подгузники. Score: 4,
 Query indx: 10607. Words: baby born. Score: 3,
 Query indx: 16837. Words: charon baby. Score: 51,
 Query indx: 16838. Words: charon baby pod. Score: 9,
 Query indx: 221511. Words: кормление happy baby. Score: 6,
 Query indx: 213483. Words: комбинезон baby go. Score: 6,
 Query indx: 64309. Words: smoant charon baby. Score: 6]

In [93]:
# JamSpell spell corrector backed by the data/ru_small.bin language model.
corrector = jamspell.TSpellCorrector()
# The call's return value is the cell output (True in the recorded run), which keeps the
# load status visible; presumably False signals a failed load — confirm in JamSpell docs.
corrector.LoadLangModel('data/ru_small.bin')

True

In [95]:
# Candidate corrections for the misspelling 'польто'; the second argument is the
# position of the word to correct within the token list.
corrector.GetCandidates(['польто'], 0)

('пальто', 'польто', 'польцо', 'полито', 'дольто')

In [96]:
# Same lookup with scores attached; in the recorded output the correct spelling
# 'пальто' ranks first with the highest (least negative) score.
corrector.GetCandidatesWithScores(['польто'], 0)

(('пальто', -55.60556954662905),
 ('польто', -56.89898564302661),
 ('польцо', -58.85446332317185),
 ('полито', -58.85446332317185),
 ('дольто', -58.85446332317185))

In [97]:
# Control case with the correctly spelled word: in the recorded output it comes
# back as its own top candidate, well ahead of the alternatives.
corrector.GetCandidatesWithScores(['пальто'], 0)

(('пальто', -50.60556954662905),
 ('сальто', -71.50082423806334),
 ('альто', -71.81758222099103),
 ('пальцо', -73.85446332317184))