# Загрузка данных

In [1]:
import json
import re
from joblib import delayed, Parallel

from razdel import sentenize, tokenize
from bpe import Encoder
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import numpy as np

In [2]:
def tokenize_(text):
    """Return the ``.text`` attribute of every token that ``tokenize`` yields for ``text``."""
    return [token.text for token in tokenize(text)]
    
def remove_tags(text):
    """Strip markup tags and flatten ``text`` to a single line.

    Everything from a ``<`` up to the next ``>`` is removed, including tags
    that are broken across several lines. Newlines and the ``&nbsp;`` /
    ``&mdash;`` entities are then each replaced by a single space.

    Args:
        text: Raw string that may contain markup.

    Returns:
        The cleaned string.
    """
    # ``[^>]*`` (unlike ``.*?``) also matches newlines, so multi-line tags are
    # removed before the newline replacement below runs.
    text = re.sub(r"<[^>]*>", "", text)
    for junk in ('\n', '&nbsp;', '&mdash;'):
        text = text.replace(junk, ' ')
    return text

In [4]:
fn = 'data/ria.json'
# Each line of the file is parsed as its own JSON document. The ``with`` block
# closes the handle as soon as parsing finishes (or fails / is interrupted).
with open(fn, 'r', encoding='UTF-8') as f:
    data = [json.loads(line) for line in tqdm(f)]
# One training text per record: the title followed by the body.
texts = [record['title'] + '. ' + record['text'] for record in tqdm(data)]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Run remove_tags over every text through joblib with n_jobs=4.
clean_jobs = (delayed(remove_tags)(text) for text in tqdm(texts))
sents = Parallel(n_jobs=4)(clean_jobs)

In [11]:
# Dump the cleaned texts to disk, joined with newline separators.
with open('data/text.txt', mode='w', encoding='UTF-8') as f:
    joined_sents = '\n'.join(sents)
    f.write(joined_sents)

In [3]:
VOCAB_SIZE = 30000

# Fit a BPE encoder on the cleaned texts and save it under 'bpe_enc'.
# params chosen for demonstration purposes
encoder = Encoder(
    VOCAB_SIZE,
    word_tokenizer=tokenize_,
)
encoder.fit(sents)
encoder.save('bpe_enc')

NameError: name 'sents' is not defined

In [4]:
VOCAB_SIZE = 30000

# Build an encoder with the custom tokenizer, then replace it with whatever
# ``load`` returns for the saved 'bpe_enc' file.
# NOTE(review): confirm whether the object returned by ``load`` still uses
# ``word_tokenizer=tokenize_`` or falls back to the library default.
encoder = Encoder(
    VOCAB_SIZE,
    word_tokenizer=tokenize_,
)
encoder = encoder.load('bpe_enc')

# Генератор

In [5]:
import traceback

import tensorflow as tf

In [36]:
# Read only the first line of the corpus; the ``with`` block closes the handle
# instead of leaving it open for the rest of the session.
with open('data/text.txt', 'r', encoding='UTF-8') as file:
    first_line = next(file).strip()
# Kept as a top-level expression so the notebook still displays the encoded ids.
list(encoder.transform([first_line]))[0]

[755,
 228,
 2,
 139,
 3053,
 24000,
 24378,
 24232,
 24067,
 24001,
 4,
 63,
 18,
 5821,
 2,
 12,
 24000,
 14477,
 24030,
 1960,
 24001,
 3,
 36,
 2,
 541,
 462,
 9,
 16,
 15,
 3,
 1163,
 287,
 18,
 2501,
 228,
 2,
 139,
 1599,
 63,
 3053,
 13648,
 18,
 5821,
 4,
 11290,
 4,
 24000,
 30,
 24073,
 246,
 24052,
 24124,
 2075,
 24023,
 24001,
 1189,
 19654,
 2,
 12,
 658,
 24000,
 14477,
 24030,
 4094,
 24073,
 24001,
 2,
 104,
 4,
 321,
 292,
 1937,
 1152,
 66,
 653,
 7,
 804,
 1154,
 635,
 239,
 5,
 4261,
 5616,
 22819,
 5,
 13,
 22819,
 24000,
 24586,
 24676,
 25077,
 16645,
 24001,
 459,
 1001,
 24000,
 10671,
 24631,
 11016,
 6021,
 24001,
 14,
 2,
 24000,
 17089,
 24122,
 24241,
 24187,
 24204,
 24023,
 24001,
 581,
 2599,
 2,
 24000,
 23771,
 4094,
 24212,
 223,
 24105,
 24001,
 136,
 2118,
 27,
 841,
 998,
 24000,
 24133,
 24078,
 24084,
 24145,
 24001,
 24000,
 24035,
 12268,
 24090,
 24001,
 13,
 24000,
 24739,
 24667,
 24590,
 24602,
 24001,
 24000,
 24876,
 19156,
 24572,
 24

In [6]:
import random

In [7]:
maxlen = 60


def gen_train(path='data/text.txt', skip_lines=150_000, transform=None):
    """Yield language-model training examples from a text file, one line at a time.

    Each line is encoded to a list of ids and cut into consecutive windows of
    ``maxlen`` ids. For a line with more than one full window, a random number
    of leading windows (between 1 and ``n_windows - 1``) is used; lines shorter
    than ``maxlen`` yield nothing.

    Args:
        path: Text file to read (UTF-8).
        skip_lines: Number of leading lines that are skipped.
        transform: Callable taking a list of strings and returning an iterable
            of id lists. Defaults to the module-level ``encoder.transform``.

    Yields:
        ``(input_seq, len_seq, max_len, output_seq)`` where ``output_seq`` is
        the window of ``maxlen`` ids, ``input_seq`` is the same window shifted
        right by one (a leading ``0`` followed by the first ``maxlen - 1`` ids)
        with up to three positions overwritten by ``0``, and ``len_seq`` and
        ``max_len`` are both ``maxlen``.
    """
    if transform is None:
        transform = encoder.transform
    # Candidate positions to blank out; loop-invariant, so built only once.
    mask_candidates = np.arange(4, maxlen - 2)
    # ``with`` makes sure the file is closed when the generator finishes or is
    # closed early.
    with open(path, 'r', encoding='UTF-8') as file:
        for k, line in enumerate(file):
            if k < skip_lines:
                continue
            try:
                idxs = list(transform([line.strip()]))[0]
            except Exception:
                # Best effort: report the failing line and move on, without
                # swallowing KeyboardInterrupt / SystemExit as a bare except would.
                print(traceback.format_exc())
                continue
            c_step = len(idxs) // maxlen
            if c_step > 1:
                c_step = random.randint(1, c_step - 1)
            for i in range(c_step):
                start = i * maxlen
                zero_idxs = np.random.choice(mask_candidates, 3)
                input_seq = np.array(idxs[start:start + maxlen - 1])
                input_seq[zero_idxs] = 0
                output_seq = idxs[start:start + maxlen]
                yield [0] + list(input_seq), maxlen, maxlen, output_seq


def parser(input_seq, len_seq, max_len, output_seq):
    """Regroup one dataset element into a ``(features, labels)`` pair.

    The features dict carries ``input_seq``, ``len_seq`` and ``max_len``;
    ``output_seq`` is returned unchanged as the labels.
    """
    features = dict(input_seq=input_seq, len_seq=len_seq, max_len=max_len)
    labels = output_seq
    return features, labels

def input_fn_train(params, is_training):
    """Build the ``tf.data`` pipeline that feeds ``gen_train`` examples to the model.

    Args:
        params: Dict of hyper-parameters; only ``params['batch_size']`` is read.
        is_training: When true, elements pass through a shuffle buffer of 1000
            before batching.

    Returns:
        A dataset of batched ``(features, labels)`` pairs as produced by ``parser``.
    """
    # Element layout follows parser's arguments: input_seq, len_seq, max_len, output_seq.
    element_types = (tf.int64, tf.int64, tf.int64, tf.int64)
    element_shapes = (
        tf.TensorShape([None]),
        tf.TensorShape([]),
        tf.TensorShape([]),
        tf.TensorShape([None]),
    )
    dataset = tf.data.Dataset.from_generator(
        lambda: gen_train(),
        element_types,
        output_shapes=element_shapes,
    )
    if is_training:
        dataset = dataset.shuffle(1000)

    # Batch first, then regroup whole batches into (features, labels).
    return dataset.batch(params['batch_size']).map(parser)


In [8]:
# encoder
def build_model(features, params, is_training):
    """Build the embedding + GRU encoder graph.

    Args:
        features: Dict with ``'input_seq'`` (token ids) and ``'len_seq'``
            (sequence lengths passed to ``dynamic_rnn``).
        params: Dict providing ``'vocab_size'`` and ``'emb_size'``.
        is_training: Enables both dropout layers when true.

    Returns:
        ``(logits, concat_pools)``: ``logits`` is a dense projection of the
        per-step GRU outputs to ``vocab_size``; ``concat_pools`` concatenates
        the mean-pooled outputs, max-pooled outputs and final GRU state.
    """
    vocab_size = params['vocab_size']

    # The embedding matrix is created before entering the 'encoder' scope.
    bpe_emb_matrix = tf.get_variable(
        'bpe_embedding_matrix',
        shape=[vocab_size, params['emb_size']],
        dtype=tf.float32,
    )

    with tf.variable_scope('encoder'):
        # Token embeddings followed by dropout.
        token_embs = tf.nn.embedding_lookup(bpe_emb_matrix, features['input_seq'])
        token_embs_dropped = tf.layers.dropout(
            inputs=token_embs, rate=0.1, training=is_training)

        # Recurrent layer: a single GRU cell with 250 units.
        gru_cell = tf.nn.rnn_cell.GRUCell(250)
        outputs, final_state = tf.nn.dynamic_rnn(
            gru_cell,
            token_embs_dropped,
            sequence_length=features['len_seq'],
            dtype=tf.float32,
        )

        # Pooled summary intended for a later classifier.
        max_pool = tf.reduce_max(input_tensor=outputs, axis=1)
        mean_pool = tf.reduce_mean(input_tensor=outputs, axis=1)
        concat_pools = tf.concat((mean_pool, max_pool, final_state), 1)

        # Language-model head: dropout, then projection to the vocabulary.
        outputs_dropped = tf.layers.dropout(
            inputs=outputs, rate=0.1, training=is_training)
        logits = tf.layers.dense(outputs_dropped, vocab_size)

    return logits, concat_pools


In [None]:
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn``: sequence loss over the encoder logits, optimised with Adam.

    Args:
        features: Dict consumed by ``build_model``; ``features['max_len']`` is
            also used to build the loss weight mask.
        labels: Target ids passed to ``sequence_loss``.
        mode: A ``tf.estimator.ModeKeys`` value; dropout is active only for TRAIN.
        params: Hyper-parameter dict forwarded to ``build_model``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying ``loss`` and ``train_op``.
    """
    training = tf.estimator.ModeKeys.TRAIN == mode

    with tf.variable_scope('model'):
        # The pooled encoder summary is not used by this loss.
        logits, _pooled = build_model(features, params, training)

    # Float mask built from max_len, used as the per-position loss weights.
    position_mask = tf.sequence_mask(features['max_len'])
    loss_weights = tf.to_float(position_mask)
    loss = tf.contrib.seq2seq.sequence_loss(logits, labels, loss_weights)

    train_op = tf.train.AdamOptimizer().minimize(
        loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

# Start from a clean graph before building the estimator.
tf.reset_default_graph()

model_params = dict(
    vocab_size=VOCAB_SIZE,
    emb_size=100,
    batch_size=32,
    num_epochs=1,
)
model_dir = 'ckpt/ulmfit_experiment_10'

# Fixed seed; summaries every 5 steps, checkpoints every 1000 steps.
config = tf.estimator.RunConfig(
    tf_random_seed=123,
    model_dir=model_dir,
    save_summary_steps=5,
    save_checkpoints_steps=1000,
)

estimator = tf.estimator.Estimator(model_fn, params=model_params, config=config)
estimator.train(input_fn=lambda: input_fn_train(model_params, True))

INFO:tensorflow:Using config: {'_model_dir': 'ckpt/ulmfit_experiment_10', '_tf_random_seed': 123, '_save_summary_steps': 5, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001A1F1F86A58>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, w

INFO:tensorflow:loss = 4.316588, step = 7601 (151.638 sec)
INFO:tensorflow:global_step/sec: 0.659958
INFO:tensorflow:loss = 4.162794, step = 7701 (151.525 sec)
INFO:tensorflow:global_step/sec: 0.660054
INFO:tensorflow:loss = 4.048342, step = 7801 (151.503 sec)
INFO:tensorflow:global_step/sec: 0.65877
INFO:tensorflow:loss = 4.1756907, step = 7901 (151.798 sec)
INFO:tensorflow:Saving checkpoints for 8000 into ckpt/ulmfit_experiment_10\model.ckpt.
INFO:tensorflow:global_step/sec: 0.648661
INFO:tensorflow:loss = 4.0562077, step = 8001 (154.164 sec)
INFO:tensorflow:global_step/sec: 0.659199
INFO:tensorflow:loss = 4.0962, step = 8101 (151.699 sec)
INFO:tensorflow:global_step/sec: 0.659212
INFO:tensorflow:loss = 4.1359425, step = 8201 (151.696 sec)
INFO:tensorflow:global_step/sec: 0.65807
INFO:tensorflow:loss = 4.30689, step = 8301 (151.960 sec)
INFO:tensorflow:global_step/sec: 0.659706
INFO:tensorflow:loss = 4.0469546, step = 8401 (151.583 sec)
INFO:tensorflow:global_step/sec: 0.657238
INFO:

INFO:tensorflow:global_step/sec: 0.649614
INFO:tensorflow:loss = 3.6701477, step = 15001 (153.939 sec)
INFO:tensorflow:global_step/sec: 0.65919
INFO:tensorflow:loss = 3.883093, step = 15101 (151.701 sec)
INFO:tensorflow:global_step/sec: 0.659741
INFO:tensorflow:loss = 3.671018, step = 15201 (151.575 sec)
INFO:tensorflow:global_step/sec: 0.659962
INFO:tensorflow:loss = 4.0345635, step = 15301 (151.524 sec)
INFO:tensorflow:global_step/sec: 0.659186
INFO:tensorflow:loss = 3.8590765, step = 15401 (151.702 sec)
INFO:tensorflow:global_step/sec: 0.659963
INFO:tensorflow:loss = 3.7815099, step = 15501 (151.524 sec)
INFO:tensorflow:global_step/sec: 0.659854
INFO:tensorflow:loss = 3.4925869, step = 15601 (151.549 sec)
INFO:tensorflow:global_step/sec: 0.658909
INFO:tensorflow:loss = 3.6029932, step = 15701 (151.766 sec)
INFO:tensorflow:global_step/sec: 0.658939
INFO:tensorflow:loss = 3.6428533, step = 15801 (151.759 sec)
INFO:tensorflow:global_step/sec: 0.658156
INFO:tensorflow:loss = 3.8112009, 

INFO:tensorflow:global_step/sec: 0.660524
INFO:tensorflow:loss = 3.5700607, step = 22401 (151.395 sec)
INFO:tensorflow:global_step/sec: 0.659793
INFO:tensorflow:loss = 3.8627696, step = 22501 (151.563 sec)
INFO:tensorflow:global_step/sec: 0.658878
INFO:tensorflow:loss = 3.829917, step = 22601 (151.773 sec)
INFO:tensorflow:global_step/sec: 0.659446
INFO:tensorflow:loss = 3.621438, step = 22701 (151.642 sec)
INFO:tensorflow:global_step/sec: 0.660245
INFO:tensorflow:loss = 3.7067902, step = 22801 (151.459 sec)
INFO:tensorflow:global_step/sec: 0.659832
INFO:tensorflow:loss = 3.7158945, step = 22901 (151.553 sec)
