In [16]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import torch

from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
torch.cuda.empty_cache()

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Training functions

In [6]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()

    output = model(x_batch.to(model.device))

    loss = loss_function(output, y_batch.to(model.device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()

In [7]:
def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)

        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)

    return epoch_loss/total

In [8]:
def trainer(count_of_epoch,
            batch_size,
            dataset,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):

    optima = optimizer(model.parameters(), lr=lr)

    iterations = tqdm(range(count_of_epoch), desc='epoch')
    iterations.set_postfix({'train epoch loss': np.nan})
    for it in iterations:
        batch_generator = tqdm(
            torch.utils.data.DataLoader(dataset=dataset,
                                        batch_size=batch_size,
                                        shuffle=True, pin_memory=True),
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size>0))

        epoch_loss = train_epoch(train_generator=batch_generator,
                    model=model,
                    loss_function=loss_function,
                    optimizer=optima,
                    callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})

### Loading a dataset and train test split

In [11]:
dataset = pd.read_csv("train.txt")
dataset = dataset[["target","proc_name"]]
dataset.head()

Unnamed: 0,target,proc_name
0,40,право заключения договора на поставку насосов ...
1,31,инструмент для трубопрокатных станов хпт и хпт...
2,27,электрооборудование каталог hahsa flex
3,60,оказание услуг по восстановлению работоспособн...
4,128,"геодезические комплектующие и аксессуары,согла..."


In [12]:
dataset.shape

(212090, 2)

In [14]:
len(set(dataset.values[:, 0]))

146

In [17]:
lens = []
for i in range (len(dataset['proc_name'])):
    lens.append(len(dataset['proc_name'][i]))

In [20]:
print ('Average length of a text is', sum(lens) // len(lens))
print ('Maximum length of a text is', max(lens))

Average length of a text is 164
Maximum length of a text is 6168


In [159]:
train_mask = np.random.rand(len(dataset), ) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]

In [160]:
dataset_train.head()

Unnamed: 0,target,proc_name
0,40,право заключения договора на поставку насосов ...
2,27,электрооборудование каталог hahsa flex
3,60,оказание услуг по восстановлению работоспособн...
4,128,"геодезические комплектующие и аксессуары,согла..."
5,39,услуги по организации транспортно-экспедиционн...


In [161]:
len(dataset_train), len(dataset_test)

(169739, 42351)

### RNN  

In [24]:
class RNNclassifier(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
    def __init__(self, vocab_dim, output_dim, emb_dim = 10, hidden_dim = 10,
                 num_layers = 3, bidirectional = False, p=0.7):
        super(RNNclassifier, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)
        self.encoder = torch.nn.LSTM(emb_dim, hidden_dim, num_layers,
                                     bidirectional=bidirectional,
                                     batch_first=True, dropout=p)
        self.linear = torch.nn.Linear(
            2*num_layers*int(bidirectional + 1)*hidden_dim,
            output_dim)
    def forward(self, input):
        input = self.embedding(input)
        _, (h, c) = self.encoder(input)
        act = torch.cat([h, c], dim=0).transpose(0, 1)
        act = act.reshape(len(input), -1)
        return self.linear(act)

### Tokenization, normalization and dictionary of words

In [162]:
stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()

In [137]:
class Tokenizer(object):
    def __init__(self, tokenizer = RegexpTokenizer('[а-яА-Я]+|[^\w\s\d,."!*«»/?№:=+();@-–]')):
        self.tokenizer = tokenizer

    def get_dictionary_and_normalized_sentences(self,sentences):
        word_to_ind = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 3, '[SEP]': 4}
        normalized_sentences = []
        for sent in tqdm(sentences):
            n_sent = []
            for word in self.tokenizer.tokenize(sent):
                word = morph.normal_forms(word)[0]
                if (len(word)>3) and (word not in stopwords_ru):
                    if (word not in word_to_ind):
                        word_to_ind[word] = word_to_ind.__len__()
                    n_sent.append(word)
            normalized_sentences.append(n_sent)
        return word_to_ind, normalized_sentences

    def get_normalized_sentences(self,sentences):
        normalized_sentences = []
        for sent in tqdm(sentences):
            n_sent = []
            for word in self.tokenizer.tokenize(sent):
                word = morph.normal_forms(word)[0]
                if (len(word)>3) and (word not in stopwords_ru):
                    n_sent.append(word)
            normalized_sentences.append(n_sent)
        return normalized_sentences

    def __call__(self, sentences, train = True, max_length = 50, pad_to_max_length = False):
        if train:
            self.word_to_ind, tokens = self.get_dictionary_and_normalized_sentences(sentences)
            self.train_tokens = tokens
        else:
            tokens = self.get_normalized_sentences(sentences)
            self.test_tokens = tokens
        if not pad_to_max_length:
            max_length = min(max_length, max(map(len, tokens)))
        tokens = [['[CLS]']+s+['[SEP]'] + ['[PAD]']*(max_length-len(s)) \
                  if len(s) < max_length \
                  else ['[CLS]']+s[:max_length]+['[SEP]'] \
                  for s in tokens ]
        ids = [[self.word_to_ind.get(w, self.word_to_ind['[UNK]']) for w in sent] for sent in tokens]
        return torch.tensor(ids)

In [163]:
tokenizer = Tokenizer()

In [164]:
%%time
train_data_sent = tokenizer(dataset_train.values[:, 1])

  0%|          | 0/169739 [00:00<?, ?it/s]

CPU times: user 16min 24s, sys: 4.77 s, total: 16min 28s
Wall time: 16min 41s


In [165]:
dataset_train.values[7, 1],tokenizer.train_tokens[7],train_data_sent[7]

('револьверная головка для токарного станка cke 6163z (китай)\r\nмодель ак21 150*4в с двигателем',
 ['револьверный',
  'головка',
  'токарный',
  'станок',
  'китай',
  'модель',
  'двигатель'],
 tensor([ 3, 53, 54, 55, 56, 57, 58, 59,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [166]:
len(tokenizer.word_to_ind)

40974

In [167]:
%%time
test_data_sent = tokenizer(dataset_test.values[:, 1], train = False)

  0%|          | 0/42351 [00:00<?, ?it/s]

CPU times: user 4min 1s, sys: 1.2 s, total: 4min 2s
Wall time: 4min 5s


In [189]:
dataset_test.values[3000, 1],tokenizer.test_tokens[3000],test_data_sent[3000]

('выполнение работ по объекту: «строительство железнодорожного пути станция  дунаевская – станция каракан» 1 этап для нужд ооо «тэк «мереть»\r\nзакупка №2018-01-29',
 ['выполнение',
  'работа',
  'объект',
  'строительство',
  'железнодорожный',
  'путь',
  'станция',
  'дунаевский',
  'станция',
  'каракан',
  'этап',
  'нужда',
  'мереть',
  'закупка'],
 tensor([    3,   120,   123,   124,   192,   863,   462,   729, 29487,   729,
         29488,   190,    36,  8417,    31,     4,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]))

In [169]:
dataset_train_pt = torch.utils.data.TensorDataset(
    train_data_sent, torch.tensor(dataset_train.values[:, 0].tolist()).long())
dataset_test_pt = torch.utils.data.TensorDataset(
    test_data_sent, torch.tensor(dataset_test.values[:, 0].tolist()).long())

### Model initialization and training

In [170]:
config = dict()
config['vocab_dim'] = len(tokenizer.word_to_ind)
config['output_dim'] = len(set(dataset.values[:, 0]))
config['emb_dim'] = 100
config['hidden_dim'] = 50
config['num_layers'] = 10
config['bidirectional'] = False
config['p'] = 0.7

model = RNNclassifier(**config)
_ = model.to(device)

In [171]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

In [172]:
%%time
trainer(count_of_epoch=10,
        batch_size=64,
        dataset=dataset_train_pt,
        model=model,
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback=None)

epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

  0%|          | 0/2653 [00:00<?, ?it/s]

CPU times: user 3min 12s, sys: 2.37 s, total: 3min 14s
Wall time: 3min 21s


### Сhecking model quality on test dataset

In [173]:
batch_generator = torch.utils.data.DataLoader(dataset=dataset_test_pt,
                                              batch_size=64,
                                              pin_memory=True)

pred = []
real = []
model.eval()
for it, (x_batch, y_batch) in enumerate(batch_generator):
    x_batch = x_batch.to(device)
    with torch.no_grad():
        output = model(x_batch)

    pred.extend(torch.argmax(output, dim=-1).cpu().numpy().tolist())
    real.extend(y_batch.cpu().numpy().tolist())

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73       274
           1       0.79      0.55      0.65        20
           2       0.30      0.19      0.23        16
           3       0.62      0.37      0.46        63
           4       0.57      0.89      0.70        18
           5       0.64      0.66      0.65       592
           6       0.55      0.26      0.35        69
           7       0.64      0.29      0.40        24
           8       0.50      0.28      0.36        25
           9       0.64      0.54      0.58       100
          10       0.59      0.91      0.71        33
          11       1.00      0.25      0.40         4
          12       0.28      0.26      0.27        74
          13       0.71      0.40      0.51       219
          14       0.74      0.82      0.78       282
          15       0.68      0.87      0.76       115
          16       0.73      0.76      0.74       739
          17       0.72    

In [174]:
accuracy = round(accuracy_score(real,pred),4)
accuracy

0.7638