In [1]:
import numpy as np
import pandas as pd
import fastText
import csv
import keras
import word2vec

from collections import Counter
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


Смотрим на данные

In [1]:
data = pd.read_csv('names_and_rubrics_learn.tsv', sep='\t', header=None).values

In [2]:
data[:10], len(data)

(array([['Автобусная компания Транспорт52',
         'Автобусные междугородные перевозки'],
        ['Мастерок', 'Строительный магазин'],
        ['Суши-бар', 'Суши-бар'],
        ['Экспром', 'Металлоизделия'],
        ['Универсам Бегемот', 'Супермаркет'],
        ['Русстрой', 'Кровля и кровельные материалы'],
        ['Прародомысл', 'Юридические услуги'],
        ['Ростов Арена, блок D418', 'Блок стадиона'],
        ['Детские игровые залы и площадки',
         'Детские игровые залы и площадки'],
        ['ДЮСШ', 'Спортивная школа']], dtype=object), 8908449)

In [3]:
rubric_counter = Counter(data[:, 1])
print(len(rubric_counter))
print(rubric_counter.most_common(20))

1222
[('Остановка общественного транспорта', 265405), ('Банкомат', 230857), ('Магазин продуктов', 222001), ('Детский сад', 191112), ('Платёжный терминал', 159885), ('Общеобразовательная школа', 159067), ('Гостиница', 138324), ('Администрация', 135056), ('Аптека', 125118), ('Автосервис, автотехцентр', 108163), ('Супермаркет', 104133), ('Автомобильная парковка', 99734), ('Банк', 97145), ('Кафе', 92419), ('Магазин автозапчастей и автотоваров', 89529), ('Магазин одежды', 86762), ('Детские игровые залы и площадки', 84761), ('Салон красоты', 82705), ('Жилой комплекс', 71494), ('Денежные переводы', 70914)]


Делим выборку на обучение, валидацию и тест

In [4]:
X_all = data[:, 0]
y_all = data[:, 1]
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X_all, y_all, test_size=0.05)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_and_test, y_val_and_test, test_size=0.5, random_state=57,
)

In [5]:
len(X_train), len(X_test)

(8463026, 445423)

Добавим в обучающие примеры организации, имя которых совпадает с рубрикой

In [12]:
train_rubrics = np.unique(y_train)

In [6]:
rubrics_in_train_mask = np.isin(train_rubrics, X_train)
rubrics_not_in_train = train_rubrics[np.logical_not(rubrics_in_train_mask)]
X_train = np.concatenate([X_train, rubrics_not_in_train])
y_train = np.concatenate([y_train, rubrics_not_in_train])

Сохраним полученные даныне

In [3]:
def dump(path, first_arr, second_arr, sep='\t'):
    data_frame = pd.DataFrame(data=np.transpose(np.array([first_arr, second_arr])))
    data_frame.to_csv(path, sep=sep, header=False, index=False)

In [None]:
dump('train.tsv', X_train, y_train)
dump('val.tsv', X_val, y_val)
dump('test.tsv', X_test, y_test)

In [2]:
def read_data(path):
    data = pd.read_csv(path, sep='\t', header=None).values
    return data[:, 0], data[:, 1]

Создадим маппинг рубрики в иднекс и обратно

In [13]:
ind_to_rubric = np.unique(y_train)
rubric_to_ind = {rubric: ind for ind, rubric in enumerate(ind_to_rubric)}

In [11]:
X_train, y_train = read_data('train.tsv')

In [6]:
X_val, y_val = read_data('val.tsv')

Будем разбивать названия на ngram'ы. Код слова - индексы ngram, которые в нем встретились.

In [9]:
def normalize_str(string):
    return string.lower()


def get_n_gram_counter(n, collection):
    counter = Counter()
    for element in collection:
        element = normalize_str(element)
        if n == 1:
            counter.update(element)
        else:
            counter.update([
                element[ind:ind + n]
                for ind in range(len(element) - n)
            ])
    return counter

In [6]:
class Storage:
    def __init__(self, elements):
        self._ind_to_elem = elements
        self._elem_to_ind = {elem: ind for ind, elem in enumerate(elements)}
        
    def get_elem(self, ind):
        return self._ind_to_elem[ind]
    
    def get_ind(self, elem):
        return self._elem_to_ind.get(elem)
    
    def __len__(self):
        return len(self._ind_to_elem)


class BagOfNgramsEncoder:
    def __init__(self, ngrams_array, max_size):
        self._ngram_storages = [Storage(ngrams) for ngrams in ngrams_array]
        self._code_size = sum(map(len, self._ngram_storages)) + 2
        self._max_size = max_size - self.n + 1
        
    @property
    def max_size(self):
        return self._max_size
    
    @property
    def n(self):
        return len(self._ngram_storages)

    def encode(self, string, max_size=None):
        if max_size is None:
            max_size = self.max_size
        else:
            max_size = max_size - self.n + 1
        string = normalize_str(string)
        code = []
        
        real_len = min(len(string) - self.n + 1, max_size)
        for i in range(real_len):
            ind = 0
            for ngram_len in reversed(range(1, self.n + 1)):
                ngram = string[i:i + ngram_len]
                ngram_ind = self._ngram_storages[ngram_len - 1].get_ind(ngram)
                if ngram_ind is None:
                    ind += len(self._ngram_storages[ngram_len - 1])
                else:
                    ind += ngram_ind
                    break
            code.append(ind)
        code += [self.code_size - 1] * (max_size - real_len)
        return np.array(code)
    
    @property
    def code_size(self):
        return self._code_size

In [17]:
def encode_array(array, max_size=None):
    max_size = max_size or max(map(len, array))
    return np.array([encoder.encode(x, max_size) for x in array])

In [11]:
most_common_ngrams = []
for n, most_common in zip(range(1, 4), [300, 2200, 7500]):
    counter = get_n_gram_counter(n, X_train)
    most_common = counter.most_common(most_common)
    most_common_ngrams.append([ngram for ngram, _ in most_common])

In [12]:
max_size = max(map(len, X_train))
encoder = BagOfNgramsEncoder(most_common_ngrams, max_size=max_size)

In [13]:
def get_dense(units, activation='relu'):
    return keras.layers.Dense(
        units, activation=activation, kernel_regularizer=keras.regularizers.l2(0.01),
    )

In [30]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(encoder.code_size, 128))
model.add(keras.layers.LSTM(128))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.BatchNormalization())
model.add(get_dense(128))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.BatchNormalization())
model.add(get_dense(len(rubric_to_ind), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer=keras.optimizers.Adam())

In [18]:
def rubric_array_to_ind(rubrics):
    return np.array([rubric_to_ind[rubric] for rubric in rubrics])


def classification_batch_generator(X_data, y_data, batch_size):
    while True:
        batch_indexes = np.random.choice(np.arange(len(X_data)), batch_size, replace=False)
        yield (
            encode_array(X_data[batch_indexes]),
            rubric_array_to_ind(y_data[batch_indexes]),
        )

Запускаем обучение

In [34]:
batch_size = 128

training_history = model.fit_generator(
    generator=classification_batch_generator(X_train, y_train, batch_size), epochs=10,
    steps_per_epoch=10000,
    callbacks=[
        keras.callbacks.TensorBoard(batch_size=batch_size),
    ],
    validation_data=classification_batch_generator(X_val, y_val, batch_size),
    validation_steps=200,
    initial_epoch=3,
)

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
model.save('keras_classifier.bin')

In [3]:
X_test, y_test = read_data('test.tsv')

In [41]:
model.evaluate(encode_array(X_test), rubric_array_to_ind(y_test))



[2.8022175528733455, 0.5437246309156077]

In [65]:
for ind in np.argmax(model.predict(encode_array(['Кафе у Ашота', 'Поликлиника №28'])), axis=1):
    print(ind_to_rubric[ind])

Кафе
Детская поликлиника


Запишем ответ для kaggle

In [14]:
def write_answer_to_file(file_obj, model, names, start_ind):
    rubric_inds = np.argmax(model.predict(encode_array(names)), axis=1)
    file_obj.write('\n'.join([
        str(start_ind + i) + ',"' + ind_to_rubric[r_ind] + '"'
        for i, r_ind in enumerate(rubric_inds)
    ]))
    file_obj.write('\n')


def create_answer(
        model, batch_size=128,
        out_path='answer.csv', test_data_path='names_and_rubrics_test_no_rubric.tsv'):
    with open(test_data_path) as test_data_file, open(out_path, 'w') as out_file:
        out_file.write('Id,Category\n')
        names = []
        
        for ind, line in enumerate(test_data_file):
            names.append(line.strip())
            if len(names) == batch_size:
                write_answer_to_file(out_file, model, names, 2 + ind - batch_size)
                names = []
        if names:
            write_answer_to_file(out_file, model, names, 2 + ind - len(names))

In [66]:
create_answer(model, batch_size=256)

Теперь давайте попробуем более толстую модель

In [15]:
most_common_ngrams = []
for n, most_common in zip(range(1, 4), [300, 2700, 15000]):
    counter = get_n_gram_counter(n, X_train)
    most_common = counter.most_common(most_common)
    most_common_ngrams.append([ngram for ngram, _ in most_common])

In [16]:
max_size = max(map(len, X_train))
encoder = BagOfNgramsEncoder(most_common_ngrams, max_size=max_size)

In [83]:
def add_dense_with_dropout_and_batch_norm(model, *args, **kwargs):
    dropout = kwargs.pop('dropout', 0.2)
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(dropout))
    model.add(get_dense(*args, **kwargs))

model = keras.models.Sequential()
model.add(keras.layers.Embedding(encoder.code_size, 256))
model.add(keras.layers.LSTM(256, return_sequences=True))
model.add(keras.layers.LSTM(256))
add_dense_with_dropout_and_batch_norm(model, 256)
add_dense_with_dropout_and_batch_norm(model, 256)
add_dense_with_dropout_and_batch_norm(model, len(rubric_to_ind), activation='softmax')

model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer=keras.optimizers.Adam())

In [86]:
batch_size = 128

training_history = model.fit_generator(
    generator=classification_batch_generator(X_train, y_train, batch_size), epochs=200,
    steps_per_epoch=1000,
    callbacks=[
        keras.callbacks.TensorBoard(batch_size=batch_size),
        keras.callbacks.ModelCheckpoint('big_classifier.bin', monitor='val_acc', save_best_only=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_delta=0.001),
    ],
    validation_data=classification_batch_generator(X_val, y_val, batch_size),
    validation_steps=200,
    initial_epoch=111,
)

Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
 114/1000 [==>...........................] - ETA: 5:46 - loss: 2.3649 - acc: 0.5850

KeyboardInterrupt: 

In [22]:
best_model = keras.models.load_model('big_classifier.bin')

In [88]:
best_model.evaluate(encode_array(X_test), rubric_array_to_ind(y_test))



[2.292465568346737, 0.5932998670907862]

In [4]:
best_model_2 = keras.models.load_model('big_good_cls.bin')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [19]:
best_model_2.evaluate(encode_array(X_test), rubric_array_to_ind(y_test))



[2.29311796141457, 0.5934570207241782]

In [23]:
create_answer(best_model, batch_size=256)

Ниже будут неудачные попытки, сохраню их для истории

# fastText

In [22]:
to_labels = np.vectorize(lambda rubric: '__label__' + str(rubric_to_ind[rubric]))
labels = to_labels(y)
with open('train_with_labels.fasttext', 'w') as f_out:
    batch = ''
    for ind, (label, name) in enumerate(zip(labels, X)):
        batch += label + ' , ' + name + '\n'
        if ind % 1000 == 999:
            f_out.write(batch)
            batch = ''
    if batch:
        f_out.write(batch)
        batch = ''

In [23]:
model = fastText.train_supervised('train_with_labels.fasttext')

In [24]:
model.save_model('fastText_model.fst')

In [67]:
model_fast_text = fastText.load_model('fastText_model.fst')

In [70]:
right = 0
for name, rubric in zip(X_val, y_val):
    predict = model_fast_text.predict(name)
    rubric_ind = int(predict[0][0][9:])
    predicted_rubric = ind_to_rubric[rubric_ind]
    if predicted_rubric == rubric:
        right += 1
        
print(right / len(X_val))

0.01448963005868592


Дно

# Что-то dssm-оподобное

In [13]:
def get_dense(units, activation='relu'):
    return keras.layers.Dense(
        units, activation=activation, kernel_regularizer=keras.regularizers.l2(0.01),
    )

Тут уже максимально упрощенный вариант сетки, пробовал LSTM, более жирные денсы, разные сетки для рубрик и имен и тд, но ничего особо не помогло

In [15]:
rubric = keras.layers.Input(shape=(None,), dtype='int32')
name = keras.layers.Input(shape=(None,), dtype='int32')

def get_embed(embed_layers, data):
    for layer in embed_layers:
        data = layer(data)
    return data


embed_layers = [
    keras.layers.Embedding(encoder.code_size, 128),
    keras.layers.GlobalAveragePooling1D(),
    get_dense(128),
    get_dense(64, activation=None),
]

rubric_embed = get_embed(embed_layers, rubric)
name_embed = get_embed(embed_layers, name)

similarity = keras.layers.Dot(axes=1, normalize=True)([name_embed, rubric_embed])

model = keras.models.Model(
    inputs=[name, rubric],
    outputs=similarity,
)

In [16]:
model.compile(keras.optimizers.Adam(), loss='binary_crossentropy')

In [21]:
def encode_array(array, max_size=None):
    max_size = max_size or max(map(len, array))
    return np.array([encoder.encode(x, max_size) for x in array])


def get_tiled(x_positive, y_positive, y_negative, batch_size, negative_cnt):
    return [
        np.tile(x_positive, (1 + negative_cnt, 1)),
        np.concatenate([y_positive, np.repeat(y_negative, batch_size, axis=0)], axis=0),
    ]


def transform_data(X_data, y_data, train_rubrics, negative_cnt):
    negative_rubrics = np.random.choice(train_rubrics, negative_cnt, replace=False)
    x_positive = encode_array(X_data)
    y_max_size = max([max(map(len, y_data)), max(map(len, negative_rubrics))])
    y_positive = encode_array(y_data, y_max_size)
    y_negative = encode_array(negative_rubrics, y_max_size)
    return (
        get_tiled(x_positive, y_positive, y_negative, batch_size, negative_cnt),
        np.concatenate([np.ones(batch_size), np.zeros(batch_size * negative_cnt)])
    )


def batch_generator(X_data, y_data, train_rubrics, batch_size, negative_cnt=3):
    while True:
        batch_indexes = np.random.choice(np.arange(len(X_data)), batch_size, replace=False)
        yield transform_data(X_data[batch_indexes], y_data[batch_indexes], train_rubrics, negative_cnt)

        
def generator_from_file(path, batch_size, train_rubrics, negative_cnt=3):
    X_batch = []
    y_batch = []
    
    while True:
        with open(path) as input_file:
            for line in input_file:
                X, y = line.split('\t')
                X_batch.append(X)
                y_batch.append(y)
                if len(X_batch) == batch_size:
                    yield transform_data(X_batch, y_batch, train_rubrics, negative_cnt)
                    X_batch = []
                    y_batch = []

In [26]:
batch_size = 64
negative_cnt = 3

training_history = model.fit_generator(
    generator=batch_generator(X_train, y_train, train_rubrics, batch_size, negative_cnt), epochs=1,
    steps_per_epoch=int(len(X_train) / batch_size),
    callbacks=[
        keras.callbacks.TensorBoard(batch_size=batch_size),
    ],
    validation_data=batch_generator(X_val, y_val, train_rubrics, batch_size, negative_cnt),
    validation_steps=200,
)

Epoch 1/1

  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




  % delta_t_median)




IndexError: index 3680485 is out of bounds for axis 0 with size 222711

In [62]:
np.where(encoder.encode('Паркмахерская у Рамиля') != 0)

(array([   0,   10,   21,   62,   74,   93,  130,  138,  885,  913,  999,
        1001, 1033, 1054, 1067, 1409, 1590, 2052, 3811, 5141]),)

Все подобные сетки ведут себя примерно так

In [30]:
model.predict([
    np.array([encoder.encode('Детские игровые залы и площадки')]),
    np.array([encoder.encode('Оружейная')]),
])

array([[0.8641187]], dtype=float32)

In [31]:
X_test, y_test = read_data('test.tsv')

In [33]:
encoded_rubrics = encode_array(train_rubrics)

In [34]:
right = 0

for x, y in zip(X_test, y_test):
    predicted_sim = model.predict([
        np.tile(np.array([encoder.encode(x)]), (len(encoded_rubrics), 1)),
        encoded_rubrics,
    ])
    max_ind = np.argmax(predicted_sim)
    if y == train_rubrics[max_ind]:
        right += 1

KeyboardInterrupt: 

In [35]:
right

441

In [37]:
np.where(X_test == x)

(array([9243]),)

И вот такая у них точность - дно. Я их конечно не доучивал до конца, но у классификации точность уже после 10 минут обучения была выше, чем у dssm после нескольких часов обучения.

In [38]:
441 / 9243

0.04771178188899708

# word2vec

In [2]:
word2vec.word2phrase('train.tsv', 'train_phrase', verbose=True)

Starting training using file train.tsv
Words processed: 39500K     Vocab size: 9741K  
Vocab size (unigrams + bigrams): 5277328
Words in train file: 39528419
Words written: 39500K

In [3]:
word2vec.word2vec('train_phrase', 'word2vec.bin', size=300, verbose=True)

Starting training using file train_phrase
Vocab size: 274788
Words in train file: 35951490
Alpha: 0.000002  Progress: 100.00%  Words/thread/sec: 129.10k   Progress: 7.05%  Words/thread/sec: 139.69k  138.30k  .32k  ha: 0.008287  Progress: 66.86%  Words/thread/sec: 133.07k  73.43%  Words/thread/sec: 132.12k  123  Progress: 99.51%  Words/thread/sec: 129.05k  

Вот это кончено самое тупое, я так и не понял, как это пофиксить ;)

In [4]:
w2v_model = word2vec.load('word2vec.bin')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 57: invalid continuation byte