In [9]:
import re
from collections import Counter

import keras
import numpy as np
from bs4 import BeautifulSoup, Comment

from file_storage import FileStorage

In [3]:
# Open the project-local storage rooted at '../filtered_storage'.
# NOTE(review): presumably maps page URLs to stored HTML (it is read by URL and
# the result is fed to an HTML parser below) -- confirm against FileStorage.
file_storage = FileStorage('../filtered_storage')

In [4]:
# Size reported by the storage (presumably the number of stored pages -- confirm).
len(file_storage)

151120

In [15]:
# https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages

def get_text(html, min_length=20, parser=None):
    """Extract the informative text fragments from an HTML document.

    A text node is kept when all of the following hold:
      * its parent tag is not one of ``style``, ``script``, ``[document]``,
        ``head`` or ``title``;
      * it is not an HTML comment;
      * its raw length (before stripping whitespace) is at least ``min_length``.

    Args:
        html: Markup accepted by ``BeautifulSoup``.
        min_length: Minimum raw length of a text node to keep. Defaults to 20.
        parser: Optional parser name forwarded to ``BeautifulSoup`` (for
            example ``'html.parser'``). ``None`` lets bs4 pick a parser, which
            is what a call without the argument does.

    Returns:
        list[str]: The kept text nodes, whitespace-stripped, in the order
        ``find_all`` yields them.
    """
    skipped_parents = {'style', 'script', '[document]', 'head', 'title'}
    soup = BeautifulSoup(html, parser)
    text_nodes = soup.find_all(string=True)

    def informative(element):
        if element.parent.name in skipped_parents:
            return False
        # bs4 exposes comments as ``Comment`` nodes whose text does not include
        # the ``<!--``/``-->`` delimiters, so they must be detected by type.
        # (The previous regex ran against ``str(bytes)``, i.e. "b'...'", and
        # therefore never matched on Python 3.)
        if isinstance(element, Comment):
            return False
        return len(element) >= min_length

    return [node.strip() for node in text_nodes if informative(node)]

In [17]:
# URL prefix (scheme + host) prepended to page paths when reading from the storage.
BEGINNING = 'https://simple.wikipedia.org'

In [18]:
# Smoke test: read the stored '/wiki/Germany' page and show its extracted text fragments.
get_text(file_storage.read(BEGINNING + '/wiki/Germany'))

['From Wikipedia, the free encyclopedia',
 'This page is about the modern country named',
 '.  For other meanings of Germany, see',
 'Federal Republic of Germany',
 'Bundesrepublik Deutschland',
 'Einigkeit und Recht und Freiheit',
 '"Unity and Justice and Freedom"',
 'and national language',
 '7001790000000000000♠',
 '7001110000000000000♠',
 '7000700000000000000♠',
 '7000230000000099999♠',
 '7000350000000000000♠',
 '6999800000000000000♠',
 '6999500000000000000♠',
 '7000170000000000000♠',
 '0.5% Other religions',
 'Frank-Walter Steinmeier',
 'President of the Bundestag',
 'President of the Bundesrat',
 'President of the Federal',
 'Constitutional Court',
 '•\xa030 June 2016 estimate',
 'Federal Republic of Germany',
 'Bundesrepublik Deutschland',
 ". The country's full name is sometimes shortened to the",
 'To the north of Germany are the',
 ', and the kingdom of',
 '. To the east of Germany are the countries of',
 '. To the south of Germany are the countries of',
 '. To the west of Ge

In [None]:
def normalize_str(string):
    """Return ``string`` converted to lower case."""
    lowered = string.lower()
    return lowered


def get_n_gram_counter(n, collection):
    """Count the character n-grams of length ``n`` in a collection of strings.

    Every string is passed through ``normalize_str`` before counting.

    Args:
        n: Length of the n-grams; must be at least 1.
        collection: Iterable of strings.

    Returns:
        collections.Counter: Maps each n-gram to its number of occurrences
        across the whole collection. Strings shorter than ``n`` contribute
        nothing.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {!r}'.format(n))

    counter = Counter()
    for element in collection:
        element = normalize_str(element)
        # A string of length L has L - n + 1 windows of length n; the "+ 1"
        # keeps the window that ends on the last character (it used to be
        # dropped). For n == 1 this is simply every character.
        counter.update(
            element[start:start + n]
            for start in range(len(element) - n + 1)
        )
    return counter

In [19]:
class Storage:
    """Two-way lookup between the elements of a sequence and their positions."""

    def __init__(self, elements):
        """Keep ``elements`` and index each element by its position.

        When an element occurs more than once, its last position is the one
        recorded in the reverse index.
        """
        position_of = {}
        for position, item in enumerate(elements):
            position_of[item] = position
        self._ind_to_elem = elements
        self._elem_to_ind = position_of

    def __len__(self):
        """Return the number of stored elements."""
        return len(self._ind_to_elem)

    def get_elem(self, ind):
        """Return the element stored at position ``ind``."""
        return self._ind_to_elem[ind]

    def get_ind(self, elem):
        """Return the position of ``elem``, or ``None`` when it is not stored."""
        return self._elem_to_ind.get(elem, None)


class BagOfNgramsEncoder:
    """Turn a string into a fixed-length sequence of integer n-gram codes.

    ``ngrams_array[k]`` is the vocabulary of n-grams of length ``k + 1``; the
    number of vocabularies is the largest n-gram length, ``n``.

    Code layout (``code_size`` distinct values):
      * At every position the longest known n-gram wins. Codes of the longest
        n-grams come first, then those of the next shorter length, and so on.
      * ``code_size - 2`` marks a position where no n-gram of any length is
        known.
      * ``code_size - 1`` is the padding code.
    """

    def __init__(self, ngrams_array, max_size):
        """Build the vocabularies and fix the default output length.

        Args:
            ngrams_array: Sequence of n-gram collections; index ``k`` holds
                the n-grams of length ``k + 1``.
            max_size: Default maximum input length in characters. The encoded
                length is ``max_size - n + 1`` because only positions where a
                full ``n``-character window fits are encoded.
        """
        self._ngram_storages = [Storage(ngrams) for ngrams in ngrams_array]
        # "+ 2" reserves the "unknown" and the "padding" codes.
        self._code_size = sum(map(len, self._ngram_storages)) + 2
        self._max_size = max_size - self.n + 1

    @property
    def max_size(self):
        """Default encoded length (already reduced by ``n - 1``)."""
        return self._max_size

    @property
    def n(self):
        """Largest n-gram length, i.e. the number of vocabularies."""
        return len(self._ngram_storages)

    @property
    def code_size(self):
        """Total number of distinct codes, including unknown and padding."""
        return self._code_size

    def _encode_position(self, string, start):
        """Return the code of the longest known n-gram starting at ``start``."""
        offset = 0
        for ngram_len in range(self.n, 0, -1):
            storage = self._ngram_storages[ngram_len - 1]
            ngram_ind = storage.get_ind(string[start:start + ngram_len])
            if ngram_ind is not None:
                return offset + ngram_ind
            # Not in this vocabulary: skip its code range and try a shorter n-gram.
            offset += len(storage)
        # Nothing matched: offset now equals code_size - 2, the "unknown" code.
        return offset

    def encode(self, string, max_size=None):
        """Encode ``string`` into an array of n-gram codes.

        Args:
            string: Text to encode; it is normalised with ``normalize_str``.
            max_size: Maximum input length in characters. ``None`` uses the
                default given to the constructor.

        Returns:
            numpy.ndarray: ``max_size - n + 1`` codes (none if that is not
            positive). Longer inputs are truncated, shorter ones are
            right-padded with ``code_size - 1``.
        """
        if max_size is None:
            max_size = self.max_size
        else:
            max_size = max_size - self.n + 1
        string = normalize_str(string)

        # Clamp at zero: for strings shorter than n - 1 the window count is
        # negative, which previously made the padding overshoot and produced
        # arrays longer than ``max_size``.
        real_len = max(0, min(len(string) - self.n + 1, max_size))
        code = [self._encode_position(string, i) for i in range(real_len)]
        code += [self.code_size - 1] * (max_size - real_len)
        return np.array(code)

In [None]:
def encode_array(encoder, array, max_size=None):
    """Encode every item of ``array`` with ``encoder`` and stack the results.

    Args:
        encoder: Object exposing ``encode(item, max_size)``.
        array: Collection of items to encode.
        max_size: Size handed to ``encoder.encode``. When falsy (``None`` or
            ``0``) the length of the longest item in ``array`` is used.

    Returns:
        numpy.ndarray: Array built from the list of per-item encodings.
    """
    if not max_size:
        max_size = max(len(item) for item in array)
    encoded_rows = []
    for item in array:
        encoded_rows.append(encoder.encode(item, max_size))
    return np.array(encoded_rows)

In [None]:
# Build one vocabulary per n-gram length: the 300 most frequent 1-grams,
# 2200 most frequent 2-grams and 7500 most frequent 3-grams of X_train.
# most_common_ngrams[k] holds the n-grams of length k + 1, most frequent first.
# NOTE(review): X_train is not defined in the visible part of the notebook --
# make sure the cell that creates it runs before this one.
most_common_ngrams = []
for n, top_k in zip(range(1, 4), [300, 2200, 7500]):
    counter = get_n_gram_counter(n, X_train)
    # Separate name for the requested size so `most_common` is always the
    # list of (ngram, count) pairs and never the integer limit.
    most_common = counter.most_common(top_k)
    most_common_ngrams.append([ngram for ngram, _ in most_common])

In [None]:
def get_dense(units, activation='relu'):
    """Create a Dense layer with ``units`` outputs and an L2 kernel penalty of 0.01.

    Args:
        units: Number of output units, passed positionally to ``Dense``.
        activation: Activation forwarded to ``Dense``; defaults to ``'relu'``.

    Returns:
        The newly created ``keras.layers.Dense`` layer.
    """
    weight_penalty = keras.regularizers.l2(0.01)
    return keras.layers.Dense(
        units,
        activation=activation,
        kernel_regularizer=weight_penalty,
    )

In [None]:
# Two int32 inputs with an unspecified (None) sequence dimension: the first
# one is the document, the second one the query.
doc, query = (
    keras.layers.Input(shape=(None,), dtype='int32'),
    keras.layers.Input(shape=(None,), dtype='int32'),
)

def get_embed(embed_layers, data):
    """Feed ``data`` through every callable in ``embed_layers``, in order.

    Returns the output of the last layer, or ``data`` itself when
    ``embed_layers`` is empty.
    """
    output = data
    for apply_layer in embed_layers:
        output = apply_layer(output)
    return output


def _tower_layers(width):
    """Return the layer stack of one tower.

    The embedding, the LSTM and the first dense layer are ``width`` wide; the
    final dense layer always has 128 units and no activation.
    """
    return [
        keras.layers.Embedding(encoder.code_size, width),
        keras.layers.LSTM(width),
        keras.layers.Dropout(0.2),
        get_dense(width),
        keras.layers.Dropout(0.2),
        get_dense(128, activation=None),
    ]


# NOTE(review): `encoder` is not created in the visible part of the notebook --
# make sure the cell that builds it runs before this one.
# Document tower first, then query tower (same creation order as before).
doc_embed_layers = _tower_layers(256)
query_embed_layers = _tower_layers(128)

doc_embed = get_embed(doc_embed_layers, doc)
query_embed = get_embed(query_embed_layers, query)

# Dot product over axis 1 with normalize=True.
similarity = keras.layers.Dot(axes=1, normalize=True)([doc_embed, query_embed])

model = keras.models.Model(inputs=[doc, query], outputs=similarity)