In [61]:
import re
import unicodedata
from collections import Counter

import keras
import numpy as np
import tqdm
import transliterate
from bs4 import BeautifulSoup, Comment

from file_storage import FileStorage

In [87]:
# Per-character fallback table used when the `transliterate` package cannot
# detect the language of a word: each key is a single (lower-case) character and
# each value is its ASCII replacement. Characters missing from the table are left
# untouched by the lookup `TRANSLITERATE_DICT.get(ch, ch)`.
#
# There are no entries for letters such as 'й' or 'ё': the caller strips
# combining marks first, which reduces them to 'и' and 'е'.
TRANSLITERATE_DICT = {
    # --- Cyrillic ---
    'а': 'a',
    'б': 'b',
    'в': 'v',
    'г': 'g',
    'д': 'd',
    'е': 'e',
    'ж': 'zh',
    'з': 'z',
    'и': 'i',
    'к': 'k',
    'л': 'l',
    'м': 'm',
    'н': 'n',
    'о': 'o',
    'п': 'p',
    'р': 'r',
    'с': 's',
    'т': 't',
    'ф': 'f',
    'х': 'h',
    'ц': 'ts',
    'ч': 'ch',
    'ш': 'sh',
    'щ': 'sch',
    'ъ': "'",
    'ы': 'y',
    'ь': "'",
    'э': 'e',
    'ю': 'ju',
    'я': 'ya',
    # --- Assorted letters, ligatures and invisible characters ---
    'π': 'pi',
    'ı': 'i',
    'ə': 'e',
    # (A former "'ل': 'j'" entry was removed here: it used the same key as
    # "\u0644" below, so it was always overwritten by the later "l" mapping.)
    'ƒ': 'f',
    'ﬁ': 'fi',
    '\xad': '-',  # soft hyphen
    'µ': 'mu',
    '\u200b': ' ',  # zero-width space
    'ː': ':',
    # --- Dashes, quotes and bullets ---
    '—': '-',
    '−': '-',
    '–': '-',
    '”': '"',
    '“': '"',
    '«': '"',
    '»': '"',
    # NOTE(review): Cyrillic 'у' maps to 'y' here (same as 'ы'), whereas the usual
    # romanisation is 'u'. Left unchanged because it may be a deliberate
    # look-alike mapping -- confirm.
    'у': 'y',
    '’': '"',
    '‘': '"',
    '`': "'",
    '„': '"',
    '·': ',',
    '•': ',',
    '…': ' ',
    # --- Arabic (Buckwalter-style ASCII transliteration) ---
    # https://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html
    "\u0621": "'", # hamza-on-the-line
    "\u0622": "|", # madda
    "\u0623": ">", # hamza-on-'alif
    "\u0624": "&", # hamza-on-waaw
    "\u0625": "<", # hamza-under-'alif
    "\u0626": "}", # hamza-on-yaa'
    "\u0627": "A", # bare 'alif
    "\u0628": "b", # baa'
    "\u0629": "p", # taa' marbuuTa
    "\u062A": "t", # taa'
    "\u062B": "v", # thaa'
    "\u062C": "j", # jiim
    "\u062D": "H", # Haa'
    "\u062E": "x", # khaa'
    "\u062F": "d", # daal
    "\u0630": "*", # dhaal
    "\u0631": "r", # raa'
    "\u0632": "z", # zaay
    "\u0633": "s", # siin
    "\u0634": "$", # shiin
    "\u0635": "S", # Saad
    "\u0636": "D", # Daad
    "\u0637": "T", # Taa'
    "\u0638": "Z", # Zaa' (DHaa')
    "\u0639": "E", # cayn
    "\u063A": "g", # ghayn
    "\u0640": "_", # taTwiil
    "\u0641": "f", # faa'
    "\u0642": "q", # qaaf
    "\u0643": "k", # kaaf
    "\u0644": "l", # laam
    "\u0645": "m", # miim
    "\u0646": "n", # nuun
    "\u0647": "h", # haa'
    "\u0648": "w", # waaw
    "\u0649": "Y", # 'alif maqSuura
    "\u064A": "y", # yaa'
    "\u064B": "F", # fatHatayn
    "\u064C": "N", # Dammatayn
    "\u064D": "K", # kasratayn
    "\u064E": "a", # fatHa
    "\u064F": "u", # Damma
    "\u0650": "i", # kasra
    "\u0651": "~", # shaddah
    "\u0652": "o", # sukuun
    "\u0670": "`", # dagger 'alif
    "\u0671": "{", # waSla
}

In [90]:
# https://stackoverflow.com/questions/34753821/remove-diacritics-from-string-for-search-function

def shave_marks(txt):
    """Strip every diacritic (combining mark) from ``txt``.

    The text is decomposed (NFD) so that accents become separate combining
    code points, those code points are discarded, and the remainder is
    recomposed (NFC) before being returned.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    # unicodedata.combining() is 0 exactly for non-combining characters.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))


def is_english_letters(string):
    """Return True when ``string`` contains only whitelisted characters.

    Despite the name, the whitelist covers ASCII letters, digits and a set of
    punctuation/symbol characters (including a few non-ASCII ones such as
    ``°``, ``©``, ``№``, ``×``, ``£``, ``¥``, ``€``). Whitespace is not on the
    whitelist, so the function is meant for single tokens. An empty string
    yields True because it contains no offending character.
    """
    not_allowed = r'[^a-zA-Z0-9°_©®™;§,№!#@.×:+=()/£¥€$|<>~{}\\\[\]%&*^?"\'-]'
    first_offender = re.search(not_allowed, string)
    return first_offender is None


def _transliterate_word(word):
    """Transliterate a single whitespace-free token.

    Tokens made only of whitelisted characters are returned unchanged.
    Otherwise the ``transliterate`` package is tried; if it cannot detect the
    language, each character is looked up in ``TRANSLITERATE_DICT`` and kept
    as-is when it has no entry.
    """
    if is_english_letters(word):
        return word
    try:
        return transliterate.translit(word, reversed=True)
    except transliterate.exceptions.LanguageDetectionError:
        return ''.join(TRANSLITERATE_DICT.get(ch, ch) for ch in word)


def try_transliterate(query):
    """Best-effort conversion of ``query`` to Latin script.

    The query is stripped of diacritics, lower-cased and NFC-normalised, then
    handed to ``transliterate.translit`` as a whole. If language detection
    fails for the whole query, it is split on whitespace, each word is handled
    by ``_transliterate_word``, and the results are joined with single spaces.
    """
    normalized = unicodedata.normalize('NFC', shave_marks(query).lower())
    try:
        return transliterate.translit(normalized, reversed=True)
    except transliterate.exceptions.LanguageDetectionError:
        return ' '.join(_transliterate_word(word) for word in normalized.split())

In [91]:
# Transliterate the query column of the training TSV and write the result next
# to the untouched second column. The output encoding is set explicitly:
# characters with no transliteration are passed through unchanged, so the file
# can contain non-ASCII text and must not depend on the platform default.
with open('req_ans_learn.tsv', encoding='utf-8-sig') as train_file, \
        open('transliterated_learn.tsv', 'w', encoding='utf-8') as transliterated_learn_file:
    for line in tqdm.tqdm(train_file):
        # Each line is expected to hold exactly two tab-separated fields.
        query, url_end = line.strip().split('\t')
        transliteration = try_transliterate(query)
        transliterated_learn_file.write(transliteration + '\t' + url_end + '\n')

500000it [00:55, 9059.98it/s] 


In [28]:
# Open the project-local FileStorage rooted at ../filtered_storage.
# NOTE(review): presumably a URL -> HTML store (it is read by URL further down) -- confirm.
file_storage = FileStorage('../filtered_storage')

In [29]:
# Sanity check: how many entries the storage reports.
len(file_storage)

151120

In [34]:
# https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages

def get_text(html, parser='html.parser', min_length=20):
    """Extract the informative text nodes of an HTML page as token lists.

    Args:
        html: HTML markup accepted by ``BeautifulSoup``.
        parser: Name of the BeautifulSoup parser. It is passed explicitly so
            the result does not depend on which optional parsers happen to be
            installed (and to avoid the "no parser specified" warning).
        min_length: Text nodes shorter than this many characters are dropped.

    Returns:
        A list with one entry per kept text node, in document order; each
        entry is the node's text split on whitespace.
    """
    soup = BeautifulSoup(html, parser)
    skipped_parents = {'style', 'script', '[document]', 'head', 'title'}

    def informative(element):
        """Decide whether a text node carries page content worth keeping."""
        if element.parent.name in skipped_parents:
            return False
        # HTML comments come back as Comment nodes whose text has no
        # "<!-- -->" delimiters, so they must be recognised by type. The
        # previous regex ran on str(bytes) (which starts with "b'") and could
        # never match.
        if isinstance(element, Comment):
            return False
        return len(element) >= min_length

    return [node.split() for node in soup.find_all(string=True) if informative(node)]

In [32]:
# Site prefix that is prepended to a page path (e.g. '/wiki/Germany') to form
# the key used when reading a page from file_storage.
BEGINNING = 'https://simple.wikipedia.org'

In [37]:
# Smoke test: run the text extraction on a single stored page.
get_text(file_storage.read(BEGINNING + '/wiki/Germany'))

In [None]:
# TODO: unfinished cell. The original content was a bare `class` keyword with no
# name or body, which is a SyntaxError and aborts "Restart & Run All"; define the
# intended class here or delete the cell.

In [None]:
def normalize_str(string):
    """Return ``string`` in the canonical form used for n-gram work (lower case)."""
    return string.lower()


def get_n_gram_counter(n, collection):
    """Count character n-grams over a collection of strings.

    Args:
        n: Length of the n-grams; must be at least 1.
        collection: Iterable of strings. Each one is normalised with
            ``normalize_str`` before counting.

    Returns:
        A ``collections.Counter`` mapping every length-``n`` substring to the
        number of times it occurs across the collection. Strings shorter than
        ``n`` contribute nothing.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    counter = Counter()
    for element in collection:
        element = normalize_str(element)
        # A string of length L has L - n + 1 windows of length n. The "+ 1" is
        # required to include the final n-gram (it was missing before, so the
        # last n-gram of every string was never counted for n > 1).
        counter.update(
            element[start:start + n]
            for start in range(len(element) - n + 1)
        )
    return counter

In [19]:
class Storage:
    """Two-way lookup between elements and their positions in a sequence."""

    def __init__(self, elements):
        """Index ``elements``; for repeated elements the last position wins."""
        position_of = {}
        for position, element in enumerate(elements):
            position_of[element] = position
        self._elem_to_ind = position_of
        self._ind_to_elem = elements

    def get_elem(self, ind):
        """Return the element stored at position ``ind``."""
        return self._ind_to_elem[ind]

    def get_ind(self, elem):
        """Return the position of ``elem``, or None when it is unknown."""
        return self._elem_to_ind.get(elem, None)

    def __len__(self):
        """Return the length of the original element sequence."""
        return len(self._ind_to_elem)


class BagOfNgramsEncoder:
    """Encode a string as a fixed-length sequence of integer n-gram codes.

    ``ngrams_array[k]`` holds the known n-grams of length ``k + 1``. For every
    start position the longest n-gram is looked up first, backing off to
    shorter ones when it is unknown. Code layout (for the longest length ``n``):

    * ``[0, len(vocab_n))``: known n-grams of length ``n``;
    * the following ranges: known n-grams of length ``n - 1``, ..., ``1``;
    * ``code_size - 2``: no n-gram of any length is known at that position;
    * ``code_size - 1``: padding.
    """

    def __init__(self, ngrams_array, max_size):
        """
        Args:
            ngrams_array: Sequence of n-gram vocabularies, ordered by n-gram
                length starting at 1.
            max_size: Maximum input length in characters; the resulting code
                length is ``max_size - n + 1``.
        """
        self._ngram_storages = [Storage(ngrams) for ngrams in ngrams_array]
        # +2 reserves the "unknown" and "padding" codes after all vocabularies.
        self._code_size = sum(map(len, self._ngram_storages)) + 2
        self._max_size = max_size - self.n + 1

    @property
    def max_size(self):
        """Default length of an encoded sequence (in codes, not characters)."""
        return self._max_size

    @property
    def n(self):
        """Length of the longest n-gram, i.e. the number of vocabularies."""
        return len(self._ngram_storages)

    def encode(self, string, max_size=None):
        """Encode ``string`` into an array of n-gram codes.

        Args:
            string: Text to encode; it is normalised with ``normalize_str``.
            max_size: Optional maximum input length in characters. When
                omitted, the length configured in the constructor is used.

        Returns:
            A numpy array of ``max_size - n + 1`` codes (empty when that is
            not positive): real codes first, then padding.
        """
        if max_size is None:
            max_size = self.max_size
        else:
            max_size = max_size - self.n + 1
        string = normalize_str(string)
        code = []

        # Clamp at 0: for a string shorter than n the window count is negative,
        # which used to inflate the padding and return more than max_size codes.
        real_len = max(0, min(len(string) - self.n + 1, max_size))
        for i in range(real_len):
            ind = 0
            # Longest n-gram first; every miss skips past that vocabulary's range.
            for ngram_len in reversed(range(1, self.n + 1)):
                ngram = string[i:i + ngram_len]
                ngram_ind = self._ngram_storages[ngram_len - 1].get_ind(ngram)
                if ngram_ind is None:
                    ind += len(self._ngram_storages[ngram_len - 1])
                else:
                    ind += ngram_ind
                    break
            code.append(ind)
        code += [self.code_size - 1] * (max_size - real_len)
        return np.array(code)

    @property
    def code_size(self):
        """Total number of distinct codes, including unknown and padding."""
        return self._code_size

In [None]:
def encode_array(encoder, array, max_size=None):
    """Encode every item of ``array`` with ``encoder`` and stack the results.

    Args:
        encoder: Object exposing ``encode(item, max_size)``.
        array: Sized items to encode.
        max_size: Length passed to every ``encode`` call. When it is falsy
            (``None`` or ``0``), the length of the longest item is used.

    Returns:
        A numpy array built from the list of per-item encodings.
    """
    if not max_size:
        max_size = max(len(item) for item in array)
    rows = [encoder.encode(item, max_size) for item in array]
    return np.array(rows)

In [None]:
# Build one vocabulary per n-gram length (n = 1, 2, 3), keeping the 300 / 2200 /
# 7500 most frequent n-grams respectively. most_common_ngrams[k] ends up holding
# the n-grams of length k + 1, most frequent first.
# NOTE(review): X_train is not defined in the visible cells -- presumably the
# collection of training strings; confirm it exists on a fresh kernel.
most_common_ngrams = []
for n, most_common in zip(range(1, 4), [300, 2200, 7500]):
    counter = get_n_gram_counter(n, X_train)
    # `most_common` is rebound here: from the integer limit to the list of
    # (ngram, count) pairs returned by Counter.most_common.
    most_common = counter.most_common(most_common)
    most_common_ngrams.append([ngram for ngram, _ in most_common])

In [None]:
def get_dense(units, activation='relu'):
    """Create a fully connected Keras layer with L2 weight regularisation.

    Args:
        units: Output dimensionality of the layer.
        activation: Activation passed to ``keras.layers.Dense``; ``None``
            gives a linear layer.

    Returns:
        A ``keras.layers.Dense`` layer whose kernel uses an L2 penalty of 0.01.
    """
    weight_penalty = keras.regularizers.l2(0.01)
    return keras.layers.Dense(
        units,
        activation=activation,
        kernel_regularizer=weight_penalty,
    )

In [None]:
# Model inputs: two variable-length sequences of int32 codes, one for the
# document and one for the query.
doc = keras.layers.Input(shape=(None,), dtype='int32')
query = keras.layers.Input(shape=(None,), dtype='int32')

def get_embed(embed_layers, data):
    """Feed ``data`` through ``embed_layers`` in order and return the result.

    Each layer is called with the output of the previous one; with an empty
    list the input is returned unchanged.
    """
    output = data
    for apply_layer in embed_layers:
        output = apply_layer(output)
    return output


# Two-tower model: the document and the query are embedded by separate stacks
# and compared with a normalised dot product.
# NOTE(review): `encoder` is not defined in the visible cells -- presumably a
# BagOfNgramsEncoder built from most_common_ngrams; confirm it exists on a
# fresh kernel.

# Document tower: code embedding -> LSTM -> two regularised dense layers with
# dropout; the last dense layer is linear and outputs 128 values.
doc_embed_layers = [
    keras.layers.Embedding(encoder.code_size, 256),
    keras.layers.LSTM(256),
    keras.layers.Dropout(0.2),
    get_dense(256),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

# Query tower: same structure with narrower layers; it also ends in a linear
# 128-unit layer so both towers produce vectors of the same size.
query_embed_layers = [
    keras.layers.Embedding(encoder.code_size, 128),
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

doc_embed = get_embed(doc_embed_layers, doc)
query_embed = get_embed(query_embed_layers, query)

# normalize=True L2-normalises both vectors before the dot product, so (per the
# Keras Dot layer documentation) the output is their cosine similarity.
similarity = keras.layers.Dot(axes=1, normalize=True)([doc_embed, query_embed])

model = keras.models.Model(
    inputs=[doc, query],
    outputs=similarity,
)