# Загрузка данных

In [1]:
import json
import re
from joblib import delayed, Parallel

from razdel import sentenize, tokenize
from bpe import Encoder
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import pandas as pd

In [2]:
def tokenize_(text):
    tokens = tokenize(text)
    return [_.text for _ in tokens]
    
def remove_tags(text):
    text  = re.sub("<.*?>", "", text).replace('\n', ' ').replace('&nbsp;', ' ').replace('&mdash;', ' ')
    return text

In [3]:
df = pd.read_csv('data/lenta.csv')
df['title'] = df['title'].apply(lambda x: str(x).replace('\xa0', ' '))
df['full_text'] = df['title'].map(lambda x: str(x) + ' ') + \
                df['text'].map(lambda x: str(x).replace('\n', ' '))
df[['full_text', 'tags']].to_csv('data/lenta_text.txt', index=False, sep='|')

In [4]:
VOCAB_SIZE = 30000
encoder = Encoder(VOCAB_SIZE, word_tokenizer=tokenize_) 
encoder = encoder.load('bpe_enc')

In [18]:
list(encoder.tokenize(df['full_text'].values[1]))[:30], df['tags'].values[1]

(['австрия',
  'не',
  'представила',
  'доказательств',
  'вины',
  'российских',
  'биатлонистов',
  '__sow',
  'ав',
  'ст',
  'ри',
  'йс',
  'ки',
  'е',
  '__eow',
  'правоохранительные',
  'органы',
  'не',
  'представили',
  'доказательств',
  'нарушения',
  'российскими',
  '__sow',
  'би',
  'ат',
  'ло',
  'ни',
  'ст',
  'ам',
  'и'],
 'Зимние виды')

In [27]:
tags = df['tags'].value_counts()
tags = tags[tags>100]
tags = tags[tags<34_000]

In [27]:
f = open('data/lenta_text.txt', 'r', encoding='UTF-8')
len([1 for _ in f])

754625

# Модель

In [5]:
import tensorflow as tf

In [6]:
maxlen = 60

def gen_train(params):
    file = open('data/lenta_text.txt', 'r', encoding='UTF-8')
    for k, line in enumerate(file):
        if k>params['max_texts']:
            break
        try:
            idxs = list(encoder.transform([line.strip().split('|')[0]]))[0]
        except:
            print(traceback.format_exc())
            continue
        len_idxs = len(idxs)
        if len_idxs<maxlen:
            idxs = idxs + [0]*(maxlen-len_idxs)
        yield [0] + idxs[:maxlen-1], maxlen, maxlen, idxs[:maxlen]


def parser(input_seq, len_seq, max_len, output_seq):
    return {'input_seq':input_seq, 'len_seq': len_seq, 'max_len': max_len}, output_seq

def input_fn_train(params, is_training):
    dataset = tf.data.Dataset.from_generator(lambda: gen_train(params),
                                             (tf.int64,tf.int64,tf.int64,tf.int64),
                                              output_shapes=(tf.TensorShape([None]), 
                                                             tf.TensorShape([]),
                                                             tf.TensorShape([]),
                                                             tf.TensorShape([None])))
    if is_training:
        dataset = dataset.repeat(count=params['num_epochs'])
        dataset = dataset.shuffle(10000)

    dataset = dataset.batch(params['batch_size'])
    dataset = dataset.map(parser)
    return dataset


In [7]:
# encoder
def build_model(features, params, is_training):
    
    # Embedding matrix
    bpe_emb_matrix = tf.get_variable('bpe_embedding_matrix',
                                 shape=[params['vocab_size'], params['emb_size']],
                                 dtype=tf.float32)

    def encode(x):
        # embedding
        embs = tf.nn.embedding_lookup(bpe_emb_matrix, x['input_seq'])
        # dropout
        dropout_emb = tf.layers.dropout(inputs=embs, 
                                        rate=0.1, 
                                        training=is_training)
        # lstm
        lstm_cell_1 = tf.nn.rnn_cell.GRUCell(250)
        outputs, final_states = tf.nn.dynamic_rnn(
            lstm_cell_1, dropout_emb, sequence_length=x['len_seq'], dtype=tf.float32)

        # for futher clf
        max_pool = tf.reduce_max(input_tensor=outputs, axis=1)
        mean_pool = tf.reduce_mean(input_tensor=outputs, axis=1)
        concat_pools = tf.concat((mean_pool, max_pool, final_states),1)
        
        # for lm
        dropout_lstm = tf.layers.dropout(inputs=outputs, 
                                        rate=0.1, 
                                        training=is_training)
        logits = tf.layers.dense(dropout_lstm, params['vocab_size'])
        
        
        return logits, concat_pools

    
    with tf.variable_scope('encoder'):
        logits, concat_pools = encode(features)
    
    return logits, concat_pools


In [8]:
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops

def triangular_learning_rate(global_step, max_steps,
                             max_lr=0.001,
                             ratio=32.,
                             cut_frac=0.1,
                            name=None):

    if global_step is None:
        raise ValueError("global_step is required for cyclic_learning_rate.")
    with ops.name_scope(name, "TriangularLearningRate",
                      [global_step]) as name:
        max_lr = ops.convert_to_tensor(max_lr)
        dtype = max_lr.dtype
        global_step = math_ops.cast(global_step, dtype)
        ratio = math_ops.cast(ratio, dtype)
        cut_frac = math_ops.cast(cut_frac, dtype)
        max_steps = math_ops.cast(max_steps, dtype)
        
        def triangular_lr():
            cut = math_ops.floor(math_ops.multiply(max_steps, cut_frac))
            # 1
            p1 = math_ops.divide(global_step, cut)
            # 2
            up = math_ops.subtract(global_step, cut)
            right = math_ops.subtract(math_ops.divide(1., cut_frac), 1.)
            down = math_ops.multiply(cut, right)
            p2 = math_ops.subtract(1., math_ops.divide(up, down))
            #
            p = tf.cond(tf.math.greater(global_step, cut), lambda: p2, lambda: p1)
            up = math_ops.add(1., math_ops.multiply(p, math_ops.subtract(ratio, 1.)))
            lr = math_ops.multiply(max_lr, math_ops.divide(up, ratio))
            return lr
        tlr = triangular_lr()
        return tlr

In [9]:
def model_fn(features, labels, mode, params):
    
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    
    with tf.variable_scope('model'):
        logits, concat_pools = build_model(features, params, is_training)
     
    # TRANSFER LEARNING
    keys = ['model/encoder']
    tf.train.init_from_checkpoint('ckpt/ulmfit_experiment_10', {k+'/': k+'/' for k in keys})
    
    weight_mask = tf.to_float(tf.sequence_mask(features['max_len']))
    loss = tf.contrib.seq2seq.sequence_loss(logits, labels, weight_mask)
    
    # Learning rate
    global_step = tf.train.get_global_step()
    lr = triangular_learning_rate(global_step, params['max_step'])
    tf.summary.scalar('learning_rate', lr)
    
    
    lr3 = lr
    lr2 = math_ops.divide(lr3, 2.6)
    lr1 = math_ops.divide(lr2, 2.6)
    
    # discr finetune   
    var_list_1 = [tf.trainable_variables()[0]]
    var_list_2 = tf.trainable_variables()[1:-2]
    var_list_3 = tf.trainable_variables()[-2:]
    opt1 = tf.train.AdadeltaOptimizer(learning_rate=lr1)
    opt2 = tf.train.AdadeltaOptimizer(learning_rate=lr2)
    opt3 = tf.train.AdadeltaOptimizer(learning_rate=lr3)
    
    grads = tf.gradients(loss, var_list_1 + var_list_2 + var_list_3)
    grads1 = grads[:len(var_list_1)]
    grads2 = grads[len(var_list_1):len(var_list_1)+len(var_list_2)]
    grads3 = grads[len(var_list_1)+len(var_list_2):]
    train_op1 = opt1.apply_gradients(zip(grads1, var_list_1))
    train_op2 = opt2.apply_gradients(zip(grads2, var_list_2))
    train_op3 = opt2.apply_gradients(zip(grads3, var_list_3))
    
    update_global_step = tf.assign(global_step, global_step + 1, name = 'update_global_step')

    
    train_op = tf.group(train_op1, train_op2, train_op3, update_global_step)

    
    
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

tf.reset_default_graph()


max_texts = 100_000
bs = 16
ep = 1
model_params = {
    'vocab_size': VOCAB_SIZE,
    'emb_size': 100,
    'batch_size': bs,
    'max_texts': max_texts,
    'num_epochs': ep,
    'max_step': ep*max_texts//bs + 1
}
model_dir = 'ckpt/ulmfit_st2_experiment_15'


config = tf.estimator.RunConfig(tf_random_seed=123,
                                model_dir=model_dir,
                                save_summary_steps=5,
                               save_checkpoints_steps=1000)

estimator = tf.estimator.Estimator(model_fn,
                                   params=model_params,
                                   config=config)
estimator.train(input_fn=lambda: input_fn_train(model_params, True))

INFO:tensorflow:Using config: {'_model_dir': 'ckpt/ulmfit_st2_experiment_15', '_tf_random_seed': 123, '_save_summary_steps': 5, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001DC1F3E13C8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_functio

INFO:tensorflow:loss = 8.946537, step = 3701 (74.373 sec)
INFO:tensorflow:global_step/sec: 1.34542
INFO:tensorflow:loss = 8.87369, step = 3801 (74.327 sec)
INFO:tensorflow:global_step/sec: 1.34398
INFO:tensorflow:loss = 8.96726, step = 3901 (74.406 sec)
INFO:tensorflow:Saving checkpoints for 4000 into ckpt/ulmfit_st2_experiment_15\model.ckpt.
INFO:tensorflow:global_step/sec: 1.29752
INFO:tensorflow:loss = 8.997647, step = 4001 (77.069 sec)
INFO:tensorflow:global_step/sec: 1.33828
INFO:tensorflow:loss = 9.042837, step = 4101 (74.723 sec)
INFO:tensorflow:global_step/sec: 1.33927
INFO:tensorflow:loss = 9.0319805, step = 4201 (74.668 sec)
INFO:tensorflow:global_step/sec: 1.33938
INFO:tensorflow:loss = 8.885153, step = 4301 (74.661 sec)
INFO:tensorflow:global_step/sec: 1.33831
INFO:tensorflow:loss = 8.963545, step = 4401 (74.722 sec)
INFO:tensorflow:global_step/sec: 1.3432
INFO:tensorflow:loss = 8.836092, step = 4501 (74.449 sec)
INFO:tensorflow:global_step/sec: 1.34304
INFO:tensorflow:loss

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x1dc1f3e1208>