In [1]:
from file_storage import FileStorage
import keras
from bs4 import BeautifulSoup
import re
import transliterate
import tqdm
import unicodedata
from collections import Counter
from math import ceil
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix

Using TensorFlow backend.


In [2]:
file_storage = FileStorage('../filtered_storage')

In [39]:
len(file_storage)

157154

Делаем транслитерацию. Есть библиотека transliterate; используя её, будем транслитерировать в несколько стадий:
1. Приводим к нижнему регистру и убираем диакритику
2. Транслитерируем весь запрос библиотекой
3. Если не получилось, транслитерируем пословно библиотекой
4. Если и это не получилось, транслитерируем слово посимвольно с помощью словаря ниже
5. Символы, которых нет в словаре, оставляем как есть

In [40]:
# Per-character fallback table used by try_transliterate: a character of a
# word is replaced by its value here, and characters without an entry are
# kept unchanged. Keys are looked up after the query has been lower-cased and
# stripped of diacritics.
TRANSLITERATE_DICT = {
    # Cyrillic letters.
    'а': 'a',
    'б': 'b',
    'в': 'v',
    'г': 'g',
    'д': 'd',
    'е': 'e',
    'ж': 'zh',
    'з': 'z',
    'и': 'i',
    'к': 'k',
    'л': 'l',
    'м': 'm',
    'н': 'n',
    'о': 'o',
    'п': 'p',
    'р': 'r',
    'с': 's',
    'т': 't',
    'ф': 'f',
    'х': 'h',
    'ц': 'ts',
    'ч': 'ch',
    'ш': 'sh',
    'щ': 'sch',
    'ъ': "'",
    'ы': 'y',
    'ь': "'",
    'э': 'e',
    'ю': 'ju',
    'я': 'ya',
    # Miscellaneous letters, ligatures, invisible characters and punctuation.
    'π': 'pi',
    'ı': 'i',
    'ə': 'e',
    # NOTE(review): this looks like the same letter as "\u0644" further down
    # (mapped to "l"); if it is the same code point, the later entry wins and
    # this one has no effect - verify.
    'ل': 'j',
    'ƒ': 'f',
    'ﬁ': 'fi',
    '\xad': '-',
    'µ': 'mu',
    '\u200b': ' ',
    'ː': ':',
    '—': '-',
    '−': '-',
    '–': '-',
    '”': '"',
    '“': '"',
    '«': '"',
    '»': '"',
    # NOTE(review): if this key is the Cyrillic letter U+0443, 'u' may have been
    # intended ('ы' above already maps to 'y') - confirm before changing.
    'у': 'y',
    '’': '"',
    '‘': '"',
    '`': "'",
    '„': '"',
    '·': ',',
    '•': ',',
    '…': ' ',
    # Arabic letters and marks, table taken from:
    # https://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html
    # NOTE(review): try_transliterate removes combining marks before the lookup,
    # so entries whose key is a combining mark are presumably never reached - verify.
    "\u0621": "'", # hamza-on-the-line
    "\u0622": "|", # madda
    "\u0623": ">", # hamza-on-'alif
    "\u0624": "&", # hamza-on-waaw
    "\u0625": "<", # hamza-under-'alif
    "\u0626": "}", # hamza-on-yaa'
    "\u0627": "A", # bare 'alif
    "\u0628": "b", # baa'
    "\u0629": "p", # taa' marbuuTa
    "\u062A": "t", # taa'
    "\u062B": "v", # thaa'
    "\u062C": "j", # jiim
    "\u062D": "H", # Haa'
    "\u062E": "x", # khaa'
    "\u062F": "d", # daal
    "\u0630": "*", # dhaal
    "\u0631": "r", # raa'
    "\u0632": "z", # zaay
    "\u0633": "s", # siin
    "\u0634": "$", # shiin
    "\u0635": "S", # Saad
    "\u0636": "D", # Daad
    "\u0637": "T", # Taa'
    "\u0638": "Z", # Zaa' (DHaa')
    "\u0639": "E", # cayn
    "\u063A": "g", # ghayn
    "\u0640": "_", # taTwiil
    "\u0641": "f", # faa'
    "\u0642": "q", # qaaf
    "\u0643": "k", # kaaf
    "\u0644": "l", # laam
    "\u0645": "m", # miim
    "\u0646": "n", # nuun
    "\u0647": "h", # haa'
    "\u0648": "w", # waaw
    "\u0649": "Y", # 'alif maqSuura
    "\u064A": "y", # yaa'
    "\u064B": "F", # fatHatayn
    "\u064C": "N", # Dammatayn
    "\u064D": "K", # kasratayn
    "\u064E": "a", # fatHa
    "\u064F": "u", # Damma
    "\u0650": "i", # kasra
    "\u0651": "~", # shaddah
    "\u0652": "o", # sukuun
    "\u0670": "`", # dagger 'alif
    "\u0671": "{", # waSla
}

In [41]:
from urllib.request import urlopen
import urllib


def download_from_the_internet(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: address passed to ``urlopen``.

    Returns:
        str: the decoded body on success.
        int: the HTTP status code when the server answers with an HTTP error
            (the error is printed unless the code is 404).
        None: when any other exception occurs (the exception is printed).

    KeyboardInterrupt is not an ``Exception`` subclass, so it still propagates
    to the caller without a dedicated handler.
    """
    try:
        # The context manager closes the connection even if reading or
        # decoding fails; the original left the response unclosed.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        code = e.code
        if code != 404:
            print(e)
        return code
    except Exception as e:
        print(e)
        return None

In [42]:
# https://stackoverflow.com/questions/34753821/remove-diacritics-from-string-for-search-function

def shave_marks(txt):
    """Return ``txt`` with every combining (diacritic) mark removed.

    The text is decomposed (NFD) so that accents become separate combining
    characters, those characters are dropped, and the remainder is recomposed
    (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))


def is_english_letters(string):
    """Return True when ``string`` holds only ASCII letters, digits and the
    punctuation/symbol characters listed in the pattern below.

    Whitespace is not in the allowed set, so the check is meant for single
    words; the empty string is accepted.
    """
    disallowed_char = r'[^a-zA-Z0-9°_©®™;§,№!#@.×:+=()/£¥€$|<>~{}\\\[\]%&*^?"\'-]'
    return not re.search(disallowed_char, string)


def try_transliterate(query):
    """Transliterate ``query``, falling back to ever simpler strategies.

    1. Strip diacritics and lower-case the query.
    2. Let the ``transliterate`` library convert the whole query.
    3. If it raises LanguageDetectionError, process the query word by word:
       words accepted by ``is_english_letters`` are kept, other words are
       given to the library individually.
    4. A word the library also rejects is converted character by character
       with TRANSLITERATE_DICT; characters without an entry stay as they are.

    In the word-by-word path the words are re-joined with single spaces.
    """
    query = unicodedata.normalize('NFC', shave_marks(query).lower())
    try:
        return transliterate.translit(query, reversed=True)
    except transliterate.exceptions.LanguageDetectionError:
        pass

    converted_words = []
    for word in query.split():
        if is_english_letters(word):
            converted_words.append(word)
            continue
        try:
            converted = transliterate.translit(word, reversed=True)
        except transliterate.exceptions.LanguageDetectionError:
            converted = ''.join(TRANSLITERATE_DICT.get(ch, ch) for ch in word)
        converted_words.append(converted)
    return ' '.join(converted_words)

Применяем транслитерацию к запросам из обучающей выборки и сохраняем что получилось

In [43]:
# Transliterate every training query and write "<query>\t<url_end>" lines.
# The output encoding is set explicitly: transliteration can leave non-ASCII
# characters in place, and the file is read back later with pandas, whose
# default encoding is UTF-8. Without it the platform default encoding is used.
with open('req_ans_learn.tsv', encoding='utf-8-sig') as train_file, \
        open('transliterated_learn.tsv', 'w', encoding='utf-8') as transliterated_learn_file:
    for line in tqdm.tqdm(train_file):
        # Each input line must contain exactly one tab: query, then URL suffix.
        query, url_end = line.strip().split('\t')
        transliteration = try_transliterate(query)
        transliterated_learn_file.write(transliteration + '\t' + url_end + '\n')

50786it [00:06, 9098.86it/s]


KeyboardInterrupt: 

Достанем текст из HTML-страницы документа

In [3]:
# https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages

def informative(element):
    """Tell whether a text node of a parsed page is worth keeping.

    A node is rejected when its parent tag is one of style/script/[document]/
    head/title, or when it is shorter than 10 characters.
    """
    skipped_parents = ['style', 'script', '[document]', 'head', 'title']
    if element.parent.name in skipped_parents:
        return False
    # NOTE(review): str() of a bytes object is its repr ("b'...'"), and
    # re.match anchors at the start, so this pattern cannot match and comment
    # nodes are NOT filtered out here. get_text looks for a 'Cached time'
    # marker in the nodes that pass this filter; if that marker lives in an
    # HTML comment, fixing this check alone would change get_text - confirm.
    looks_like_comment = re.match('<!--.*-->', str(element.encode('utf-8')))
    if looks_like_comment:
        return False
    return len(element) >= 10


def get_text(html):
    """Extract the main text fragments of an HTML page.

    Text nodes are scanned in document order. Collection starts after the node
    whose stripped text equals 'Jump to search' and stops at the first
    informative node containing the 'Cached time: ... / Cache expiry: ...'
    report. In between, every node accepted by ``informative`` is kept.

    Args:
        html: page markup handed to BeautifulSoup.

    Returns:
        list of stripped text fragments; empty if the start marker is absent.
    """
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; results may differ between environments.
    soup = BeautifulSoup(html)
    data = soup.findAll(text=True)

    # The original also built an unused list of all informative nodes here,
    # which doubled the filtering work; it has been removed.
    text = []
    started = False
    for line in data:
        if line.strip() == 'Jump to search':
            started = True
            continue
        if started and informative(line):
            if re.search(r'Cached time: \d+\nCache expiry: \d+', line) is not None:
                break
            text.append(line.strip())

    return text

In [4]:
BEGINNING = 'https://simple.wikipedia.org'

In [None]:
get_text(file_storage.read(BEGINNING + '/wiki/Germany'))

Сделаем энкодинг запросов и документов.
Запросы будем кодировать по n-граммам, а документы — по словам и триграммам.

In [5]:
def get_words(string):
    """Lower-case ``str(string)`` and split it into non-empty words.

    The separators are , . ! ? ; : ( ) space, tab and newline. Non-string
    inputs are converted with ``str`` first.
    """
    lowered = str(string).lower()
    pieces = re.split('[,.!?;:() \t\n]', lowered)
    return list(filter(None, pieces))


def get_ngrams(n, word_list):
    """Return the character n-grams of every word in ``word_list``.

    For ``n == 1`` the result is a single string of all words concatenated
    (iterating it yields the individual characters); for any other ``n`` it is
    a list of substrings of length ``n`` that never span two words. Words
    shorter than ``n`` contribute nothing.
    """
    if n == 1:
        return ''.join(word_list)

    ngrams = []
    for word in word_list:
        last_start = len(word) - n
        for start in range(last_start + 1):
            ngrams.append(word[start:start + n])
    return ngrams


def get_n_gram_counter(n, collection):
    """Count the character n-grams over all elements of ``collection``.

    Every element is split into words with ``get_words`` and the n-grams of
    those words (``get_ngrams``) are tallied in a single Counter.
    """
    return Counter(
        ngram
        for element in collection
        for ngram in get_ngrams(n, get_words(element))
    )

In [6]:
class Storage:
    """Two-way mapping between elements and their positions in a sequence."""

    def __init__(self, elements):
        """Index ``elements``; if an element repeats, its last position wins."""
        self._ind_to_elem = elements
        self._elem_to_ind = {elem: ind for ind, elem in enumerate(elements)}

    def get_elem(self, ind):
        """Return the element stored at position ``ind``."""
        return self._ind_to_elem[ind]

    def get_ind(self, elem, default=None):
        """Return the position of ``elem`` or ``default`` if it is unknown."""
        return self._elem_to_ind.get(elem, default)

    def __len__(self):
        return len(self._ind_to_elem)

    @classmethod
    def from_counter(cls, counter, n_most_common):
        """Build a storage from the ``n_most_common`` most frequent keys of
        ``counter``, most frequent first.

        Uses ``cls`` so that subclasses get an instance of their own type; the
        original always returned a plain ``Storage``.
        """
        return cls([elem for elem, _ in counter.most_common(n_most_common)])
    

class Encoder:
    """Base class for encoders that turn an example into a list of int codes."""

    def encode(self, example):
        """Return the list of integer codes for ``example``.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        # ``NotImplemented`` is a constant, not an exception: calling it, as the
        # original did, raises TypeError instead of the intended error.
        raise NotImplementedError()

    def encode_with_padding(self, examples):
        """Encode ``examples`` and right-pad the codes with zeros.

        Returns:
            int32 array of shape (len(examples), longest code length); an
            empty (0, 0) array when ``examples`` is empty.
        """
        codes = [self.encode(example) for example in examples]
        if not codes:
            # max() of an empty sequence would raise ValueError.
            return np.zeros((0, 0), dtype=np.int32)
        max_len = max(map(len, codes))
        padded = np.zeros((len(codes), max_len), dtype=np.int32)
        for row, code in zip(padded, codes):
            row[:len(code)] = code
        return padded


class BagOfNgramsEncoder(Encoder):
    """Encodes a string as a sequence of character n-gram codes.

    Code layout, where ``n`` is the number of tracked n-gram lengths::

        0                  padding (encode_with_padding pads with zeros)
        1 ...              known n-grams of length n
        ...                known n-grams of length n - 1, ..., down to 1
        code_size - 1      unknown n-gram

    The original returned raw storage indexes for 1- and 2-character words, so
    their codes collided with the padding code and with codes of longer
    n-grams. All words now use the layout above. Encodings of short words
    therefore differ from the original; models trained on the old codes must
    be retrained.
    """

    def __init__(self, collection, ngram_number_array):
        """Build the n-gram vocabularies.

        Args:
            collection: iterable of strings the n-grams are counted on.
            ngram_number_array: ``ngram_number_array[k]`` is the number of
                most frequent (k + 1)-grams to keep.
        """
        self._ngram_storages = [
            Storage.from_counter(get_n_gram_counter(n + 1, collection), ngram_number)
            for n, ngram_number in enumerate(ngram_number_array)
        ]
        # +2: one code for padding and one for unknown n-grams.
        self._code_size = sum(map(len, self._ngram_storages)) + 2

    @property
    def n(self):
        """Longest tracked n-gram length."""
        return len(self._ngram_storages)

    def _ngram_code(self, ngram):
        """Return the code of ``ngram`` or None if it is not in the vocabulary."""
        length = len(ngram)
        if not 1 <= length <= self.n:
            return None
        ind = self._ngram_storages[length - 1].get_ind(ngram)
        if ind is None:
            return None
        # Longer n-grams come first in the layout, after the padding code.
        offset = 1 + sum(len(storage) for storage in self._ngram_storages[length:])
        return offset + ind

    def _encode_word(self, word):
        """Encode one word.

        Words longer than two characters (and at least ``n`` long) produce one
        code per window of ``n`` characters: the longest known n-gram starting
        at the window start, or the unknown code. Shorter words are encoded as
        a whole when they are a known n-gram, otherwise character by character.
        """
        unknown = self._code_size - 1
        if len(word) > 2 and len(word) >= self.n:
            codes = []
            for start in range(len(word) - self.n + 1):
                for ngram_len in range(self.n, 0, -1):
                    code = self._ngram_code(word[start:start + ngram_len])
                    if code is not None:
                        break
                else:
                    code = unknown
                codes.append(code)
            return codes

        whole_word_code = self._ngram_code(word)
        if whole_word_code is not None:
            return [whole_word_code]
        char_codes = (self._ngram_code(char) for char in word)
        return [unknown if code is None else code for code in char_codes]

    def encode(self, string):
        """Return the concatenated codes of all words of ``string``."""
        # A flat comprehension avoids the quadratic cost of sum(lists, []).
        return [
            code
            for word in get_words(string)
            for code in self._encode_word(word)
        ]

    @property
    def code_size(self):
        """Total number of distinct codes, including padding and unknown."""
        return self._code_size

In [7]:
class DocEncoder(Encoder):
    """Encodes a document (a list of text fragments) as word/trigram codes.

    Code layout, with W stored words and T stored trigrams::

        0                  padding
        1 .. W             known words
        W + 1 .. W + T     known character trigrams of unknown words
        W + T + 1          unknown trigram (== code_size - 1)

    Attribute names are unchanged because the notebook pickles instances of
    this class.
    """

    def __init__(self, storage, collection, n_words, n_trigrams):
        """Count words and trigrams over the documents and keep the top ones.

        Args:
            storage: object whose ``read(url)`` returns the page HTML; a None
                result is skipped.
            collection: iterable of URL suffixes appended to BEGINNING.
            n_words: number of most frequent words to keep.
            n_trigrams: number of most frequent trigrams to keep.
        """
        trigram_counter = Counter()
        word_counter = Counter()

        for example in tqdm.tqdm(set(collection)):
            html = storage.read(BEGINNING + example)
            if html is None:
                continue
            text = get_text(html)
            words, trigrams = self._prepare_example(text)
            word_counter.update(words)
            trigram_counter.update(trigrams)

        self._word_storage = Storage.from_counter(word_counter, n_words)
        self._trigram_storage = Storage.from_counter(trigram_counter, n_trigrams)

    @staticmethod
    def _prepare_example(example):
        """Return (words, trigrams) collected from all fragments of ``example``."""
        words = []
        trigrams = []
        for example_part in example:
            curr_words = get_words(example_part)
            words += curr_words
            trigrams += get_ngrams(3, curr_words)
        return words, trigrams

    def _encode_word(self, word):
        """Return the codes of one word.

        A known word gives a single code. An unknown word gives one code per
        character trigram, so an unknown word shorter than three characters
        gives no codes at all.
        """
        word_code = self._word_storage.get_ind(word)
        if word_code is not None:
            return [word_code + 1]

        trigram_offset = 1 + len(self._word_storage)
        # One past the last stored trigram; with the offset it equals _unk_ind.
        unknown_trigram = self._unk_ind - trigram_offset
        return [
            trigram_offset + self._trigram_storage.get_ind(trigram, unknown_trigram)
            for trigram in get_ngrams(3, [word])
        ]

    def encode(self, example):
        """Return the concatenated codes of every word of every fragment."""
        # A flat comprehension keeps this linear in the document length; the
        # original sum(lists, []) copied the growing result for every word.
        return [
            code
            for example_part in example
            for word in get_words(example_part)
            for code in self._encode_word(word)
        ]

    @property
    def _unk_ind(self):
        return self.code_size - 1

    @property
    def _pad_ind(self):
        return 0

    @property
    def code_size(self):
        """Total number of distinct codes, including padding and unknown."""
        return len(self._word_storage) + len(self._trigram_storage) + 2

Это я юзал, когда пытался каждый токен в док-те обрабатывать отдельно. Если тупо засовывать весь документ не хватало памяти на сколько-нибудь адекватную сетку. Поэтому сделал такой костыль: семплируем n подряд идущих токенов из док-та и юзаем в данном батче только их

In [8]:
class EncodeWithSampling(Encoder):
    """Reads pre-computed document codes and returns a random window of them."""

    def __init__(self, codes_storage, doc_encode_len, code_size):
        """
        Args:
            codes_storage: object whose ``read(doc_name)`` returns the codes
                of a document as a comma-separated string.
            doc_encode_len: maximum number of codes returned per document.
            code_size: value reported by the ``code_size`` property.
        """
        self._storage = codes_storage
        self._doc_encode_len = doc_encode_len
        self._code_size = code_size

    def encode(self, doc_name):
        """Return at most ``doc_encode_len`` consecutive codes of a document.

        Longer documents are cut to a window whose start is drawn uniformly
        with ``np.random.randint``, so repeated calls may return different
        windows.
        """
        # NOTE(review): an empty stored string splits into [''], for which
        # int('') raises ValueError - confirm whether empty documents occur.
        tokens = self._storage.read(doc_name).split(',')
        surplus = len(tokens) - self._doc_encode_len
        if surplus > 0:
            start = np.random.randint(surplus + 1)
            tokens = tokens[start:start + self._doc_encode_len]
        return [int(token) for token in tokens]

    @property
    def code_size(self):
        return self._code_size

Разобьем данные на train и val и сохраним

In [9]:
def read_data(path):
    """Read a two-column, header-less TSV file.

    Every field is kept as the exact string stored in the file. With the
    pandas defaults a query such as "null", "NA" or "nan" became a float NaN
    and an all-digit query became a number.

    Args:
        path: location of the TSV file.

    Returns:
        (first_column, second_column) as numpy arrays of strings.
    """
    # NOTE(review): default CSV quoting is still applied, so a raw (unquoted)
    # field that starts with a double quote can be misparsed - verify the
    # hand-written input files.
    frame = pd.read_csv(
        path, sep='\t', header=None, dtype=str, keep_default_na=False,
    )
    data = frame.values
    return data[:, 0], data[:, 1]


def dump(path, *arrays):
    """Write equally long ``arrays`` as the columns of a header-less TSV file.

    Row ``i`` of the file holds the ``i``-th item of every array, separated by
    tabs; no index column is written.
    """
    columns = np.array(arrays).T
    pd.DataFrame(data=columns).to_csv(path, sep='\t', header=False, index=False)

In [None]:
from sklearn.model_selection import train_test_split


queries, docs = read_data('transliterated_learn.tsv')
queries_train, queries_val, docs_train, docs_val = train_test_split(queries, docs, test_size=0.05, random_state=1)

In [None]:
dump('train.tsv', queries_train, docs_train)
dump('val.tsv', queries_val, docs_val)

In [10]:
queries_train, docs_train = read_data('train.tsv')
queries_val, docs_val = read_data('val.tsv')

In [15]:
queries_train[3]

'artura rokhama'

In [16]:
docs_train[3]

'/wiki/Arthur_Rackham'

Сделаем ещё отдельно файлы, где есть прямо названия док-тов, чтобы попытаться обучиться только на названии док-та, не используя его содержимое (попытка неудачная, конечно же)

In [18]:
doc_train_names = [
    re.sub('_', ' ', name[6:])
    for name in docs_train
]


In [20]:
doc_val_names = [
    re.sub('_', ' ', name[6:])
    for name in docs_val
]

In [21]:
dump('train_with_names.tsv', queries_train, doc_train_names)
dump('val_with_names.tsv', queries_val, doc_val_names)

In [35]:
queries_train, doc_train_names = read_data('train_with_names.tsv')
queries_val, doc_val_names = read_data('val_with_names.tsv')

In [22]:
query_encoder = BagOfNgramsEncoder(queries_train, [600, 2500, 20000])

In [13]:
doc_word_num = 65000
doc_trigram_num = 10000
doc_code_size = doc_word_num + doc_trigram_num + 2

In [None]:
doc_encoder = DocEncoder(file_storage, docs_train, doc_word_num, doc_trigram_num)

Сохраним кодировщик док-тов, потому что строится он долго

In [None]:
with open('big_big_doc_encoder.pkl', 'wb') as doc_encoder_file:
    pickle.dump(doc_encoder, doc_encoder_file)

In [None]:
with open('big_big_doc_encoder.pkl', 'rb') as doc_encoder_file:
    doc_encoder = pickle.load(doc_encoder_file)

In [None]:
doc_encoder.code_size

Сохраним так же и сами коды док-тов, они тоже строятся долго

In [12]:
encoded_docs = FileStorage('encoded_docs', need_compression=False)

In [None]:
# Encode every distinct train/val document once and cache the result.
# Documents already present in encoded_docs, or missing from file_storage,
# are skipped, so the cell can be re-run to resume an interrupted pass.
for doc in tqdm.tqdm(set(np.concatenate([docs_train, docs_val]))):
    if doc not in encoded_docs and BEGINNING + doc in file_storage:
        code = doc_encoder.encode(get_text(file_storage.read(BEGINNING + doc)))
        # Codes are stored as one comma-separated string per document.
        encoded_docs.write(doc, ','.join(map(str, code)))

Сначала делал так:
Закодированные запросы и док-ты подаю в керасовский Embedding, а потом прогоняю через LSTM. Но с док-тами это сделать не получалось из-за их размеров. Перешел на GlobalPooling, но все равно все было оч медленно, даже с EncodeWithSampling. Да и глупо как-то, ведь если сделать bag of words и от него Dense, это по факту то же самое, что Embedding + Pooling

In [14]:
def get_tiled(x_positive, y_positive, y_negative, batch_size, negative_cnt):
    """Pair every row of ``x_positive`` with its positive and with each negative.

    Returns a two-element list:
      * ``x_positive`` stacked ``1 + negative_cnt`` times along axis 0;
      * ``y_positive`` followed by every row of ``y_negative`` repeated
        ``batch_size`` times.
    """
    tiled_x = np.tile(x_positive, (1 + negative_cnt, 1))
    repeated_negatives = np.repeat(y_negative, batch_size, axis=0)
    stacked_y = np.concatenate([y_positive, repeated_negatives], axis=0)
    return [tiled_x, stacked_y]


def batch_generator(queries, doc_names, doc_encoder, query_encoder, batch_size, negative_cnt=2,
                    max_consecutive_failures=1000):
    """Endlessly yield ``([doc_codes, query_codes], labels)`` training batches.

    Each batch samples ``batch_size`` (query, document) pairs as positives and
    ``negative_cnt`` extra queries; every sampled document is additionally
    paired with each extra query as a negative (see ``get_tiled``). Labels are
    ``batch_size`` ones followed by ``batch_size * negative_cnt`` zeros.

    Only documents present in ``doc_encoder._storage`` are sampled. Batches
    whose encoding fails, or whose document codes reach
    ``doc_encoder.code_size``, are skipped.

    Args:
        queries, doc_names: parallel arrays supporting integer-array indexing.
        doc_encoder, query_encoder: objects with ``encode_with_padding``.
        batch_size: number of positive pairs per batch.
        negative_cnt: number of negative queries per batch.
        max_consecutive_failures: how many batches in a row may fail to encode
            before the last error is re-raised.

    Raises:
        ValueError: if fewer documents are available than one batch needs.
        RuntimeError: after ``max_consecutive_failures`` failures in a row.
    """
    allowed_inds = [ind for ind, name in enumerate(doc_names) if name in doc_encoder._storage]
    sample_size = batch_size + negative_cnt
    if len(allowed_inds) < sample_size:
        # Sampling without replacement could never succeed; the original
        # swallowed that error and looped forever.
        raise ValueError(
            'need at least {} encodable documents, got {}'.format(sample_size, len(allowed_inds))
        )

    failures = 0
    while True:
        try:
            indexes = np.random.choice(allowed_inds, sample_size, replace=False)
            batch_indexes = indexes[:batch_size]
            query_codes = query_encoder.encode_with_padding(queries[indexes])
            doc_codes = doc_encoder.encode_with_padding(doc_names[batch_indexes])
        except Exception as error:
            # Best-effort skip of a bad batch. ``Exception`` (not a bare
            # except) lets KeyboardInterrupt and GeneratorExit through.
            failures += 1
            if failures >= max_consecutive_failures:
                raise RuntimeError(
                    'batch encoding failed {} times in a row'.format(failures)
                ) from error
            continue
        failures = 0

        if np.any(doc_codes >= doc_encoder.code_size):
            continue

        inputs = get_tiled(
            doc_codes, query_codes[:batch_size], query_codes[batch_size:],
            batch_size, negative_cnt,
        )
        labels = np.concatenate([np.ones(batch_size), np.zeros(batch_size * negative_cnt)])
        # The yield sits outside the try block so closing the generator works.
        yield inputs, labels

In [28]:
def get_dense(units, activation='relu'):
    """Create a Dense layer with ``units`` outputs, the given activation and
    an L2 kernel regularizer with factor 0.01."""
    l2_penalty = keras.regularizers.l2(0.01)
    return keras.layers.Dense(units, activation=activation, kernel_regularizer=l2_penalty)


def get_embed(embed_layers, data):
    """Feed ``data`` through ``embed_layers`` in order and return the result.

    Each layer is called with the output of the previous one; with no layers
    ``data`` is returned unchanged.
    """
    output = data
    for apply_layer in embed_layers:
        output = apply_layer(output)
    return output

In [None]:
# Two-tower model: documents and queries arrive as variable-length sequences
# of integer codes and are mapped to 128-dimensional embeddings.
doc = keras.layers.Input(shape=(None,), dtype='int32')
query = keras.layers.Input(shape=(None,), dtype='int32')

# Document tower: token embeddings averaged over the sequence, then an MLP.
# NOTE(review): doc_encoder_with_sampling is not defined in the visible part
# of the notebook - presumably an EncodeWithSampling instance; the cell fails
# on a fresh kernel without it.
doc_embed_layers = [
    keras.layers.Embedding(doc_encoder_with_sampling.code_size, 128),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

# Query tower: token embeddings fed through an LSTM, then an MLP.
query_embed_layers = [
    keras.layers.Embedding(query_encoder.code_size, 128),
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

doc_embed = get_embed(doc_embed_layers, doc)
query_embed = get_embed(query_embed_layers, query)

# normalize=True L2-normalizes both embeddings, so the output is their cosine
# similarity.
similarity = keras.layers.Dot(axes=1, normalize=True)([doc_embed, query_embed])

model = keras.models.Model(
    inputs=[doc, query],
    outputs=similarity,
)

# NOTE(review): binary cross-entropy expects probabilities in [0, 1], while a
# cosine similarity lies in [-1, 1] - confirm this loss is intended here.
model.compile(keras.optimizers.Adam(lr=1e-4), loss='binary_crossentropy')

In [None]:
# Train the embedding/LSTM model on batches with sampled negatives.
batch_size = 32
negative_cnt = 2

# NOTE(review): doc_encoder_with_sampling is not defined in the visible part
# of the notebook - the cell fails on a fresh kernel without it.
training_history = model.fit_generator(
    generator=batch_generator(
        queries_train, docs_train, doc_encoder_with_sampling,
        query_encoder, batch_size, negative_cnt
    ),
    epochs=50,
    steps_per_epoch=1000,
    callbacks=[
        keras.callbacks.TensorBoard(batch_size=batch_size),
        # Keeps only the weights with the best validation loss.
        keras.callbacks.ModelCheckpoint('first_dssm.bin', monitor='val_loss', save_best_only=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=3, min_delta=0.0002),
    ],
    validation_data=batch_generator(
        queries_val, docs_val, doc_encoder_with_sampling,
        query_encoder, batch_size, negative_cnt
    ),
    validation_steps=200,
)

Дальше перехожу собственно к bag of words для документов

In [29]:
def linear_loss(similarity, labels):
    """Loss equal to one minus the mean of ``similarity * labels``.

    The product is symmetric, so the result does not depend on which of the
    two arguments holds the labels. Items whose label is 0 contribute nothing
    to the mean.
    """
    mean_agreement = keras.backend.mean(similarity * labels)
    return 1 - mean_agreement


def mean_negative_score(y_true, y_pred):
    """Keras metric: mean predicted score over the negative (label 0) items.

    Keras calls metric functions as ``fn(y_true, y_pred)``. The original
    signature was ``(score, labels)``, so the labels were used as scores and
    the scores as labels, and the reported number was not the mean negative
    score.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    The result is undefined (division by zero) for a batch without negatives.
    """
    negative_mask = 1 - y_true
    return keras.backend.sum(y_pred * negative_mask) / keras.backend.sum(negative_mask)


def mean_positive_score(y_true, y_pred):
    """Keras metric: mean predicted score over the positive (label 1) items.

    Keras calls metric functions as ``fn(y_true, y_pred)``. The original
    signature was ``(score, labels)``, so the labels were used as scores and
    the scores as labels, and the reported number was not the mean positive
    score.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    The result is undefined (division by zero) for a batch without positives.
    """
    return keras.backend.sum(y_pred * y_true) / keras.backend.sum(y_true)

In [21]:
def get_sparse_docs(encoded_docs, doc_code_size, doc_names):
    """Build a sparse bag-of-codes matrix for the given documents.

    Args:
        encoded_docs: object whose ``read(name)`` returns the codes of a
            document as a comma-separated string (empty fields are ignored).
        doc_code_size: number of columns; every code is taken modulo this
            value.
        doc_names: names of the documents, one matrix row each.

    Returns:
        csr_matrix of shape (len(doc_names), doc_code_size) whose entry
        (i, c) is how many times code ``c`` occurs in document ``i``.
    """
    values, columns, row_starts = [], [], [0]
    for doc in doc_names:
        token_counts = Counter(
            int(code) % doc_code_size
            for code in encoded_docs.read(doc).split(',') if code
        )
        columns.extend(token_counts.keys())
        values.extend(token_counts.values())
        # CSR row pointer: where the next document's entries begin.
        row_starts.append(len(columns))
    return csr_matrix(
        (values, columns, row_starts), shape=(len(doc_names), doc_code_size)
    )


def sparse_docs_batch_generator(
        queries, doc_names, encoded_docs, query_encoder, batch_size, doc_code_size):
    """Endlessly yield ``([sparse_docs, query_codes], labels)`` batches.

    Half of the batch (rounded down) are positives: a document with its own
    query. The rest are negatives: a document paired with the query of another
    sampled example. Labels are ones for the positives followed by zeros for
    the negatives. Only documents contained in ``encoded_docs`` are sampled.
    """
    allowed_inds = [ind for ind, name in enumerate(doc_names) if name in encoded_docs]
    positive_cnt = batch_size // 2
    negative_cnt = batch_size - positive_cnt
    # Negatives need two distinct examples each: one document, one query.
    sample_size = positive_cnt + 2 * negative_cnt
    while True:
        indexes = np.random.choice(allowed_inds, sample_size, replace=False)
        doc_indexes = indexes[:batch_size]
        query_indexes = np.concatenate([indexes[:positive_cnt], indexes[batch_size:]])
        query_codes = query_encoder.encode_with_padding(queries[query_indexes])
        sparse_docs = get_sparse_docs(encoded_docs, doc_code_size, doc_names[doc_indexes])
        labels = np.concatenate([np.ones(positive_cnt), np.zeros(negative_cnt)])
        yield [sparse_docs, query_codes], labels

Думал сделать negative sampling, мб и сделаю когда-то потом, но из-за проблем ниже не дошли до него руки

In [None]:
class NegativeSamplingGenerator:
    """Unfinished batch iterator intended for negative sampling.

    Only the constructor arguments are stored. ``__next__`` had no body in the
    original (a syntax error that made the whole cell unusable); it now raises
    NotImplementedError until the sampling logic is written.
    """

    def __init__(
            self, queries, doc_names, encoded_docs, query_encoder,
            half_batch_size, doc_code_size, initial_negative_cnt):
        self._queries = queries
        self._doc_names = doc_names
        self._encoded_docs = encoded_docs
        self._query_encoder = query_encoder
        self._half_batch_size = half_batch_size
        self._doc_code_size = doc_code_size
        self._initial_negative_cnt = initial_negative_cnt

    def __iter__(self):
        return self

    def __next__(self):
        # TODO: implement negative sampling.
        raise NotImplementedError('negative sampling is not implemented yet')

Пытаюсь значит чего-то обучить и получается плохо

In [22]:
# Two-tower model with a bag-of-words document input: each document is a
# sparse vector of code counts of length doc_code_size.
doc = keras.layers.Input(shape=(doc_code_size,), sparse=True)
query = keras.layers.Input(shape=(None,), dtype='int32')

# Document tower: an MLP applied directly to the count vector.
doc_embed_layers = [
    get_dense(256),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

# Query tower: token embeddings fed through an LSTM, then an MLP.
query_embed_layers = [
    keras.layers.Embedding(query_encoder.code_size, 128),
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

doc_embed = get_embed(doc_embed_layers, doc)
query_embed = get_embed(query_embed_layers, query)

# normalize=True L2-normalizes both embeddings, so `similarity` is their
# cosine similarity, a value in [-1, 1].
similarity = keras.layers.Dot(axes=1, normalize=True)([doc_embed, query_embed])
# NOTE(review): a sigmoid of a value in [-1, 1] stays within roughly
# [0.27, 0.73], so the score can never approach the 0/1 targets; a learnable
# or fixed scale before the sigmoid may be needed - confirm.
score = keras.layers.Activation('sigmoid')(similarity)

model = keras.models.Model(
    inputs=[doc, query],
    outputs=score,
)

model.compile(keras.optimizers.Adam(lr=1e-2), loss='binary_crossentropy', metrics=[
    mean_positive_score, mean_negative_score,
])

In [None]:
# Train the bag-of-words model; each batch is half positive and half negative
# pairs (see sparse_docs_batch_generator).
batch_size = 32

training_history = model.fit_generator(
    generator=sparse_docs_batch_generator(
        queries_train, docs_train, encoded_docs,
        query_encoder, batch_size, doc_code_size,
    ),
    epochs=20,
    steps_per_epoch=2000,
    callbacks=[
        keras.callbacks.TensorBoard(batch_size=batch_size),
        # Keeps only the weights with the best validation loss.
        keras.callbacks.ModelCheckpoint('doc_bow_dssm.bin', monitor='val_loss', save_best_only=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_delta=0.0002),
    ],
    validation_data=sparse_docs_batch_generator(
        queries_val, docs_val, encoded_docs,
        query_encoder, batch_size, doc_code_size,
    ),
    validation_steps=1000,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20

In [None]:
query_encoder.encode('The silence of the Somme: Sound and realism in British and Dutch poems mediating The Battle of the Somme')

In [None]:
query_encoder.encode_with_padding([
        'The silence of the Somme: Sound and realism in British '
        'and Dutch poems mediating The Battle of the Somme',
        'runescape'
    ])

In [21]:
batch = next(sparse_docs_batch_generator(
        queries_train, docs_train, encoded_docs,
        query_encoder, batch_size, doc_code_size,
    ))

In [22]:
model.predict(batch[0])

array([[0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148]], dtype=float32)

In [23]:
model.predict([
    get_sparse_docs(encoded_docs, doc_code_size, [
        '/wiki/Battle_of_the_Somme',
        '/wiki/RuneScape',
        '/wiki/Liliaceae',
        '/wiki/Annual_percentage_rate'
    ]),
    query_encoder.encode_with_padding([
        'the silence of the somme: sound and realism in british '
        'and dutch poems mediating the battle of the somme',
        'runescape',
        'liliaceae - tulipa lord beaconsfield + parmesiano',
        'one-tenth of the amount to payday max apr if you borrow one hundred '
        'dollars in a fourteen-day period the highest yearly percentage rate '
        'could be 309 max amount there isn t any specified amount of',
    ]),
])

array([[0.48224148],
       [0.48224148],
       [0.48224148],
       [0.48224148]], dtype=float32)

Пробовал много чего: менять lr, брать больше слов при энкодинге, делать сетку глубже/шире, другой лосс (linear_loss), менять соотношение позитивов и негативов в батче. Ничего не помогло - сеть во всех случаях сходилась к константным предсказаниям

Дальше тут попытался использовать только имена документов, но разумеется это тоже ни к чему не привело

In [33]:
def doc_name_batch_generator(queries, doc_names, encoder, batch_size, negative_cnt):
    """Endlessly yield ``([doc_name_codes, query_codes], labels)`` batches.

    A batch has ``batch_size`` pairs. The first ``batch_size - negative_cnt``
    are positives (a document name with its own query); the last
    ``negative_cnt`` are negatives (a document name with the query of another
    sampled example). Labels are ones for the positives followed by zeros for
    the negatives.

    The original overwrote ``negative_cnt`` with ``batch_size - batch_size // 2``
    and so ignored the argument. It is honoured now; the notebook's call
    (batch_size=32, negative_cnt=16) produces the same batches as before.

    Args:
        queries, doc_names: parallel arrays supporting integer-array indexing.
        encoder: object with ``encode_with_padding``, used for both sides.
        batch_size: number of pairs per batch.
        negative_cnt: number of negative pairs per batch.

    Raises:
        ValueError: if ``negative_cnt`` is not within ``[0, batch_size]``.
    """
    if not 0 <= negative_cnt <= batch_size:
        raise ValueError('negative_cnt must be between 0 and batch_size')
    positive_cnt = batch_size - negative_cnt
    # Negatives need two distinct examples each: one document, one query.
    sample_size = positive_cnt + 2 * negative_cnt
    labels = np.concatenate([np.ones(positive_cnt), np.zeros(negative_cnt)])
    while True:
        indexes = np.random.choice(len(doc_names), sample_size, replace=False)
        query_indexes = np.concatenate([indexes[:positive_cnt], indexes[batch_size:]])
        query_codes = encoder.encode_with_padding(queries[query_indexes])
        doc_name_codes = encoder.encode_with_padding(doc_names[indexes[:batch_size]])
        yield (
            [doc_name_codes, query_codes],
            labels.copy(),
        )

In [31]:
# Two-tower model that sees only the document *name*: both inputs are
# variable-length code sequences produced by the query encoder, and both
# towers have the same architecture with separate weights.
doc_name = keras.layers.Input(shape=(None,), dtype='int32')
query = keras.layers.Input(shape=(None,), dtype='int32')

doc_embed_layers = [
    keras.layers.Embedding(query_encoder.code_size, 128),
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

query_embed_layers = [
    keras.layers.Embedding(query_encoder.code_size, 128),
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.2),
    get_dense(128),
    keras.layers.Dropout(0.2),
    get_dense(128, activation=None),
]

doc_embed = get_embed(doc_embed_layers, doc_name)
query_embed = get_embed(query_embed_layers, query)

# normalize=True L2-normalizes both embeddings, so `similarity` is their
# cosine similarity, a value in [-1, 1].
similarity = keras.layers.Dot(axes=1, normalize=True)([doc_embed, query_embed])
# NOTE(review): a sigmoid of a value in [-1, 1] stays within roughly
# [0.27, 0.73], so the score can never approach the 0/1 targets - confirm.
score = keras.layers.Activation('sigmoid')(similarity)

model = keras.models.Model(
    inputs=[doc_name, query],
    outputs=score,
)

model.compile(keras.optimizers.Adam(), loss='binary_crossentropy', metrics=[
    mean_positive_score, mean_negative_score,
])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [38]:
# Train the name-only model on batches of half positive, half negative pairs.
batch_size = 32
negative_cnt = 16

training_history = model.fit_generator(
    generator=doc_name_batch_generator(
        queries_train, doc_train_names,
        query_encoder, batch_size, negative_cnt,
    ),
    epochs=20,
    steps_per_epoch=2000,
    callbacks=[
        keras.callbacks.TensorBoard(batch_size=batch_size),
        # Own checkpoint file: the original reused 'doc_bow_dssm.bin', the
        # path of the bag-of-words model, and would have overwritten it.
        keras.callbacks.ModelCheckpoint('doc_name_dssm.bin', monitor='val_loss', save_best_only=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_delta=0.0002),
    ],
    validation_data=doc_name_batch_generator(
        queries_val, doc_val_names,
        query_encoder, batch_size, negative_cnt,
    ),
    validation_steps=1000,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

ResourceExhaustedError: OOM when allocating tensor with shape[32,13633,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training/Adam/gradients/lstm_3/transpose_grad/transpose}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Более опытные товарищи подсказали, что такая фигня иногда случается и что, например, добавление словесных биграмм может помочь. В общем, я еще попытаюсь тут что-то сделать, но пока вот так(