In [1]:
import numpy as np
import pandas as pd
import fastText
import csv
import keras
import word2vec

from collections import Counter

Using TensorFlow backend.


In [1]:
data = pd.read_csv('names_and_rubrics_learn.tsv', sep='\t', header=None).values

In [2]:
data[:10], len(data)

(array([['Автобусная компания Транспорт52',
         'Автобусные междугородные перевозки'],
        ['Мастерок', 'Строительный магазин'],
        ['Суши-бар', 'Суши-бар'],
        ['Экспром', 'Металлоизделия'],
        ['Универсам Бегемот', 'Супермаркет'],
        ['Русстрой', 'Кровля и кровельные материалы'],
        ['Прародомысл', 'Юридические услуги'],
        ['Ростов Арена, блок D418', 'Блок стадиона'],
        ['Детские игровые залы и площадки',
         'Детские игровые залы и площадки'],
        ['ДЮСШ', 'Спортивная школа']], dtype=object), 8908449)

In [3]:
rubric_counter = Counter(data[:, 1])
print(len(rubric_counter))
print(rubric_counter.most_common(20))

1222
[('Остановка общественного транспорта', 265405), ('Банкомат', 230857), ('Магазин продуктов', 222001), ('Детский сад', 191112), ('Платёжный терминал', 159885), ('Общеобразовательная школа', 159067), ('Гостиница', 138324), ('Администрация', 135056), ('Аптека', 125118), ('Автосервис, автотехцентр', 108163), ('Супермаркет', 104133), ('Автомобильная парковка', 99734), ('Банк', 97145), ('Кафе', 92419), ('Магазин автозапчастей и автотоваров', 89529), ('Магазин одежды', 86762), ('Детские игровые залы и площадки', 84761), ('Салон красоты', 82705), ('Жилой комплекс', 71494), ('Денежные переводы', 70914)]


In [4]:
from sklearn.model_selection import train_test_split

X_all = data[:, 0]
y_all = data[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.05)

In [5]:
len(X_train), len(X_test)

(8463026, 445423)

In [6]:
train_rubrics = np.unique(y_train)
rubrics_in_train_mask = np.isin(train_rubrics, X_train)
rubrics_not_in_train = train_rubrics[np.logical_not(rubrics_in_train_mask)]
X_train = np.concatenate([X_train, rubrics_not_in_train])
y_train = np.concatenate([y_train, rubrics_not_in_train])

In [7]:
len(X_train), len(rubrics_not_in_train), len(train_rubrics)

(8463122, 96, 1222)

In [9]:
len(y_train)

8463122

In [10]:
def dump(path, first_arr, second_arr, sep='\t'):
    data_frame = pd.DataFrame(data=np.transpose(np.array([first_arr, second_arr])))
    data_frame.to_csv(path, sep=sep, header=False, index=False)

In [None]:
dump('train.tsv', X_train, y_train)
dump('val.tsv', X_test, y_test)

In [7]:
def read_data(path):
    data = pd.read_csv(path, sep='\t', header=None).values
    return data[:, 0], data[:, 1]

In [17]:
ind_to_rubric = np.unique(y)
rubric_to_ind = {rubric: ind for ind, rubric in enumerate(ind_to_rubric)}

In [22]:
to_labels = np.vectorize(lambda rubric: '__label__' + str(rubric_to_ind[rubric]))
labels = to_labels(y)
with open('train_with_labels.fasttext', 'w') as f_out:
    batch = ''
    for ind, (label, name) in enumerate(zip(labels, X)):
        batch += label + ' , ' + name + '\n'
        if ind % 1000 == 999:
            f_out.write(batch)
            batch = ''
    if batch:
        f_out.write(batch)
        batch = ''

In [23]:
model = fastText.train_supervised('train_with_labels.fasttext')

In [24]:
model.save_model('fastText_model.fst')

In [4]:
X_val, y_val = read_data('val.tsv')

In [42]:
right = 0
for name, rubric in zip(X_val, y_val):
    predict = model.predict(name)
    rubric_ind = int(predict[0][0][9:])
    predicted_rubric = ind_to_rubric[rubric_ind]
    if predicted_rubric == rubric:
        right += 1

(('__label__939',), array([0.61871117]))
939


ValueError: 

In [8]:
X_train, y_train = read_data('train.tsv')

In [9]:
def get_n_gram_counter(n, collection):
    counter = Counter()
    for element in collection:
        element = element.lower()
        if n == 1:
            counter.update(element)
        else:
            counter.update([
                element[ind:ind + n]
                for ind in range(len(element) - n)
            ])
    return counter

In [10]:
counter = get_n_gram_counter(3, X_train)

In [11]:
print(len([(char, cnt) for char, cnt in counter.items() if cnt > 200]))

15619


In [12]:
counter = get_n_gram_counter(2, X_train)
print(len([(char, cnt) for char, cnt in counter.items() if cnt > 200]))

2168


In [13]:
counter = get_n_gram_counter(1, X_train)
print(len([(char, cnt) for char, cnt in counter.items() if cnt > 200]))

99


In [14]:
class Storage:
    def __init__(self, elements):
        self._ind_to_elem = elements
        self._elem_to_ind = {elem: ind for ind, elem in enumerate(elements)}
        
    def get_elem(self, ind):
        return self._ind_to_elem[ind]
    
    def get_ind(self, elem):
        return self._elem_to_ind.get(elem)
    
    def __len__(self):
        return len(self._ind_to_elem)


class BagOfNgramsEncoder:
    def __init__(self, ngrams_array):
        self._ngram_storages = [Storage(ngrams) for ngrams in ngrams_array]
        self._code_size = sum(map(len, self._ngram_storages)) + 1
        
    @property
    def n(self):
        return len(self._ngram_storages)

    def encode(self, string):
        string = string.lower()
        code = np.zeros(self._code_size)
        
        ones = len(string) - n + 1
        for i in range(ones):
            ind = 0
            for ngram_len in reversed(range(1, n + 1)):
                ngram = string[i:i + ngram_len]
                ngram_ind = self._ngram_storages[ngram_len - 1].get_ind(ngram)
                if ngram_ind is None:
                    ind += len(self._ngram_storages[ngram_len - 1])
                else:
                    ind += ngram_ind
                    break
            code[ind] += 1
        return code / ones
    
    @property
    def code_size(self):
        return self._code_size

In [15]:
counter.most_common(5)

[('о', 14386178),
 ('а', 13928397),
 (' ', 13420874),
 ('и', 10453110),
 ('е', 9608085)]

In [16]:
most_common_ngrams = []
for n, most_common in zip(range(1, 4), [250, 1000, 2000]):
    counter = get_n_gram_counter(n, X_train)
    most_common = counter.most_common(most_common)
    most_common_ngrams.append([ngram for ngram, _ in most_common])

In [17]:
encoder = BagOfNgramsEncoder(most_common_ngrams)

In [18]:
all_indexes = np.arange(len(X_train))

In [19]:
def get_dense(units, activation='relu'):
    return keras.layers.Dense(
        units, activation=activation, kernel_regularizer=keras.regularizers.l2(0.01),
    )

In [20]:
rubric = keras.layers.Input(shape=(encoder.code_size,))
rubric_embed = get_dense(300)(rubric)
rubric_embed = get_dense(200)(rubric_embed)
rubric_embed = get_dense(100, activation=None)(rubric_embed)

name = keras.layers.Input(shape=(encoder.code_size,))
name_embed = get_dense(500)(name)
name_embed = get_dense(500)(name_embed)
name_embed = get_dense(100, activation=None)(name_embed)

similarity = keras.layers.Dot(axes=1, normalize=True)([name_embed, rubric_embed])

model = keras.models.Model(inputs=[name, rubric], outputs=similarity)

Instructions for updating:
Colocations handled automatically by placer.


In [21]:
model.compile(keras.optimizers.Adam(), loss='binary_crossentropy')

In [22]:
def batch_generator(batch_size, negative_cnt=10):
    while True:
        indexes = np.random.choice(all_indexes, batch_size + negative_cnt, replace=False)
        batch_indexes = indexes[:batch_size]
        negative_indexes = indexes[batch_size:]
        x_positive = np.array([encoder.encode(x) for x in X_train[batch_indexes]])
        y_positive = np.array([encoder.encode(x) for x in y_train[batch_indexes]])
        y_negative = np.array([encoder.encode(x) for x in y_train[negative_indexes]])
        yield (
            [
                np.tile(x_positive, (1 + negative_cnt, 1)),
                np.concatenate([y_positive, np.repeat(y_negative, batch_size, axis=0)], axis=0),
            ],
            np.concatenate([np.ones(batch_size), np.zeros(batch_size * negative_cnt)])
        )

In [23]:
batch_size = 32
model.fit_generator(
    generator=batch_generator(batch_size, 1), epochs=1,
    steps_per_epoch=int(len(X_train) / batch_size)
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1
     3/264472 [..............................] - ETA: 139:01:24 - loss: 25.8271



   454/264472 [..............................] - ETA: 23:03:03 - loss: nan

KeyboardInterrupt: 

In [62]:
np.where(encoder.encode('Паркмахерская у Рамиля') != 0)

(array([   0,   10,   21,   62,   74,   93,  130,  138,  885,  913,  999,
        1001, 1033, 1054, 1067, 1409, 1590, 2052, 3811, 5141]),)

In [57]:
model.predict([np.array([encoder.encode('Больница')]), np.array([encoder.encode('Магазин')])])

array([[0.16666147]], dtype=float32)

In [2]:
word2vec.word2phrase('train.tsv', 'train_phrase', verbose=True)

Starting training using file train.tsv
Words processed: 39500K     Vocab size: 9741K  
Vocab size (unigrams + bigrams): 5277328
Words in train file: 39528419
Words written: 39500K

In [3]:
word2vec.word2vec('train_phrase', 'word2vec.bin', size=300, verbose=True)

Starting training using file train_phrase
Vocab size: 274788
Words in train file: 35951490
Alpha: 0.000002  Progress: 100.00%  Words/thread/sec: 129.10k   Progress: 7.05%  Words/thread/sec: 139.69k  138.30k  .32k  ha: 0.008287  Progress: 66.86%  Words/thread/sec: 133.07k  73.43%  Words/thread/sec: 132.12k  123  Progress: 99.51%  Words/thread/sec: 129.05k  

In [4]:
w2v_model = word2vec.load('word2vec.bin')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 57: invalid continuation byte