# Imports

In [None]:
! pip install -q maru razdel simple-elmo

In [1]:
# If notebook is used in colab
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import zipfile
import pickle
import os
import re

import nltk
import maru
import razdel
import pandas as pd
import numpy as np
import tensorflow as tf
from math import ceil
from os.path import join
from collections import namedtuple
from evaluate import evaluate
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_set = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import input_pipeline
import model 
import utils
import preprocessor
import loader
import script

In [4]:
class HParams(namedtuple(
        'hparams',
        'num_units num_layers num_classes dropout_rate learning_rate '
        'batch_size num_epochs buffer_size num_steps_to_eval chkpts_dir')):
    """Immutable record of the hyperparameters used throughout the notebook.

    The whole record is handed to ``model.Model``; ``batch_size``,
    ``num_epochs`` and ``buffer_size`` are also passed to
    ``input_pipeline.get_iterator``, ``num_steps_to_eval`` is the evaluation
    interval of the training loop, and ``chkpts_dir`` is forwarded to
    ``utils.deval``. How ``model.Model`` interprets the remaining fields is
    not visible in this notebook.
    """


hparams = HParams(
    num_units=128,
    num_layers=3,
    num_classes=8,
    dropout_rate=0.3,
    learning_rate=3e-04,
    batch_size=64,
    num_epochs=20,
    buffer_size=512,
    num_steps_to_eval=10,
    chkpts_dir='./model/chkpts',
)

# Load pretrained ELMo

In [None]:
# Download the archive 199.zip from the NLPL vectors repository into the working directory.
! wget http://vectors.nlpl.eu/repository/20/199.zip

In [None]:
! mkdir ./pretrained_models
! mkdir ./pretrained_models/elmo
! mv 199.zip ./pretrained_models/elmo

In [None]:
# Unpack the downloaded archive next to itself. The context manager closes
# the archive handle even if extraction raises.
with zipfile.ZipFile('./pretrained_models/elmo/199.zip') as zipped_file:
    zipped_file.extractall('./pretrained_models/elmo')

# Data load

In [None]:
# Matches any character that is NOT a Cyrillic letter in the А-я / ё ranges,
# a digit, one of . , ? ! : ; or a space.
# NOTE(review): uppercase 'Ё' is not part of the class; confirm whether the
# preprocessor lowercases text before applying the pattern.
pattern = re.compile(r'[^а-яА-я0-9.,?!:; ё]')
analyzer = maru.get_analyzer(tagger='linear')

config = preprocessor.Config(
    regexp=pattern,
    stopwords=stopwords_set,
    analyzer=analyzer,
    with_pos_tag=False,
    remove_stop_words=False,
    lemmatize=True,
    tokenizer=razdel,
)
pipeline = preprocessor.Preprocessor(config)

# Three independent, still unfitted 300-component PCA objects; they are handed
# to the train loader in a later cell.
pca_seq, pca_query, pca_cont = (PCA(n_components=300) for _ in range(3))

In [None]:
# Both files are parsed as tab-separated, including the one named test.csv.
train = pd.read_csv('./data/train.tsv', sep='\t')
test = pd.read_csv('./data/test.csv', sep='\t')


In [None]:
# Build the training loader around the preprocessing pipeline and the PCA
# objects, then load the training set using the extracted ELMo directory.
train_loader = loader.TrainLoader(pipeline, pca_seq, pca_query, pca_cont)
(train_q,
 train_c,
 train_s,
 train_lens,
 train_target) = train_loader.load_dataset('./data/train.tsv',
                                           './pretrained_models/elmo')

In [None]:
# The dev loader reuses the PCA objects held by train_loader rather than
# creating new ones.
dev_loader = loader.DevLoader(pipeline,
                              train_loader.pca_seq,
                              train_loader.pca_query,
                              train_loader.pca_cont)
(dev_q,
 dev_c,
 dev_s,
 dev_lens,
 dev_target) = dev_loader.load_dataset('./data/test.csv',
                                       './pretrained_models/elmo')

In [None]:
# Persist the PCA objects held by train_loader so the "Evaluate from scratch"
# section can load them. Each file is opened in a `with` block so the handle
# is flushed and closed deterministically.
for pca_obj, pca_path in ((train_loader.pca_seq, './data/pca_seq.p'),
                          (train_loader.pca_cont, './data/pca_cont.p'),
                          (train_loader.pca_query, './data/pca_query.p')):
    with open(pca_path, 'wb') as pca_file:
        pickle.dump(pca_obj, pca_file)

# Fit

In [None]:
# Serialize every train/dev object to the paths that input_pipeline.FileObj
# is given in the next cell. Files are written inside `with` blocks so each
# handle is flushed and closed before anything reads the file back.
pickle_targets = (
    (train_s, './data/train_seq.p'),
    (dev_s, './data/test_seq.p'),
    (train_q, './data/train_queries.p'),
    (train_c, './data/train_contexts.p'),
    (dev_q, './data/test_queries.p'),
    (dev_c, './data/test_contexts.p'),
    (train_target, './data/train_target.p'),
    (dev_target, './data/test_target.p'),
    (train_lens, './data/train_lens.p'),
    (dev_lens, './data/test_lens.p'),
)
for payload, payload_path in pickle_targets:
    with open(payload_path, 'wb') as payload_file:
        pickle.dump(payload, payload_file)

In [None]:
# Training and evaluation each get their own graph and their own session.
train_graph, dev_graph = tf.Graph(), tf.Graph()
train_session = tf.Session(graph=train_graph)
dev_session = tf.Session(graph=dev_graph)

# Pickle files written by the previous cell, in the positional order used
# here: sequences, queries, contexts, lengths, targets.
train_files = input_pipeline.FileObj(
    './data/train_seq.p',
    './data/train_queries.p',
    './data/train_contexts.p',
    './data/train_lens.p',
    './data/train_target.p',
)
dev_files = input_pipeline.FileObj(
    './data/test_seq.p',
    './data/test_queries.p',
    './data/test_contexts.p',
    './data/test_lens.p',
    './data/test_target.p',
)

with train_graph.as_default():
    train_iterator, train_total_num = input_pipeline.get_iterator(
        regime='TRAIN',
        buffer_size=hparams.buffer_size,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size,
        filesobj=train_files,
    )
    train_model = model.Model(hparams, train_iterator, 'TRAIN')
    train_vars_init_op = tf.global_variables_initializer()
    # Freeze the graph so later cells cannot add ops to it by accident.
    tf.get_default_graph().finalize()

with dev_graph.as_default():
    dev_iterator, dev_total_num = input_pipeline.get_iterator(
        regime='DEV',
        buffer_size=None,
        num_epochs=1,
        batch_size=hparams.batch_size,
        filesobj=dev_files,
    )
    # No variable initializer is created for the dev graph here; presumably
    # the weights are restored from a checkpoint inside utils (TODO confirm).
    dev_model = model.Model(hparams, dev_iterator, 'DEV')
    tf.get_default_graph().finalize()

In [None]:
! rm -r './model'
! rm -r './logs'

rm: cannot remove './logs': No such file or directory


In [None]:
# Number of optimisation steps for the whole run and for one pass over dev.
train_steps = utils.count_num_steps(hparams.num_epochs,
                                    train_total_num,
                                    hparams.batch_size)
eval_steps = utils.count_num_steps(1,
                                   dev_total_num,
                                   hparams.batch_size)
eval_count = hparams.num_steps_to_eval

train_session.run(train_iterator.initializer)
train_session.run(train_vars_init_op)
aris = []  # score returned by every evaluation so far, in order

with tqdm(total=train_steps) as prog:
    for step in range(train_steps):
        _, res = train_model.train(train_session)
        if step % 20 == 0:
            print('train loss at step {} = {}'.format(step, res.train_loss))
        if step % eval_count == 0:
            ari, current_path = utils.deval(
                train_model,
                train_session,
                hparams.chkpts_dir,
                step,
                dev_model,
                dev_session,
                dev_iterator,
                eval_steps,
                test)
            aris.append(ari)
            # `aris` already contains `ari`, so this is true exactly when the
            # current evaluation ties or beats every earlier one.
            if ari >= max(aris):
                # Create the directory whenever it is needed instead of only
                # at step 0 through a shell call whose failure went unnoticed.
                os.makedirs('./model/best_shot', exist_ok=True)
                utils.clear_dir('./model/best_shot')
                utils.copy_files(current_path)
                best_shot_path = current_path
            print('eval ari at step {} = {}'.format(step, ari))
        prog.update(1)

# Evaluate with prepared data

In [None]:
# Entry point of the local `script` module. Judging by the output recorded
# below, it restores ./best_shot/chkpts-130 and reports the dev-set score
# (the module itself is not visible here).
script.run()

INFO:tensorflow:Restoring parameters from ./best_shot/chkpts-130


2021-02-08 19:36:30,621 : INFO : Restoring parameters from ./best_shot/chkpts-130


HBox(children=(FloatProgress(value=0.0, max=58.0), HTML(value='')))


score on the dev set = 0.15733293562616749


# Evaluate from scratch

In [10]:
# Same preprocessing configuration as in the "Data load" section.
# The pattern matches any character that is NOT a Cyrillic letter in the
# А-я / ё ranges, a digit, one of . , ? ! : ; or a space.
pattern = re.compile(r'[^а-яА-я0-9.,?!:; ё]')
analyzer = maru.get_analyzer(tagger='linear')
config = preprocessor.Config(
    regexp=pattern,
    stopwords=stopwords_set,
    analyzer=analyzer,
    with_pos_tag=False,
    remove_stop_words=False,
    lemmatize=True,
    tokenizer=razdel,
)
pipeline = preprocessor.Preprocessor(config)

# Load the PCA objects pickled during training. Files are opened in `with`
# blocks so the handles are closed right after reading.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
with open('./data/pca_seq.p', 'rb') as pca_file:
    pca_seq = pickle.load(pca_file)
with open('./data/pca_cont.p', 'rb') as pca_file:
    pca_cont = pickle.load(pca_file)
with open('./data/pca_query.p', 'rb') as pca_file:
    pca_query = pickle.load(pca_file)

test = pd.read_csv('./data/test.csv', sep='\t')

In [None]:
# Load the dev set with the PCA objects restored from disk.
dev_loader = loader.DevLoader(pipeline, pca_seq, pca_query, pca_cont)
(dev_q,
 dev_c,
 dev_s,
 dev_lens,
 dev_target) = dev_loader.load_dataset('./data/test.csv',
                                       './pretrained_models/elmo')

In [7]:
# Write the dev objects to the paths passed to input_pipeline.FileObj in the
# next cell. `with` guarantees each file is flushed and closed first.
for dev_payload, dev_payload_path in ((dev_s, './data/test_seq.p'),
                                      (dev_q, './data/test_queries.p'),
                                      (dev_c, './data/test_contexts.p'),
                                      (dev_target, './data/test_target.p'),
                                      (dev_lens, './data/test_lens.p')):
    with open(dev_payload_path, 'wb') as dev_payload_file:
        pickle.dump(dev_payload, dev_payload_file)

In [8]:
# Positional order: sequences, queries, contexts, lengths, targets.
dev_files = input_pipeline.FileObj(
    './data/test_seq.p',
    './data/test_queries.p',
    './data/test_contexts.p',
    './data/test_lens.p',
    './data/test_target.p',
)

In [11]:
dev_graph = tf.Graph()

with dev_graph.as_default():
    dev_iterator, dev_total_num = input_pipeline.get_iterator(
        regime='DEV',
        buffer_size=None,
        num_epochs=1,
        batch_size=hparams.batch_size,
        filesobj=dev_files,
    )
    dev_model = model.Model(hparams, dev_iterator, 'DEV')
    # Freeze the graph so nothing can be added to it afterwards.
    tf.get_default_graph().finalize()

dev_session = tf.Session(graph=dev_graph)
eval_steps = utils.count_num_steps(1, dev_total_num, hparams.batch_size)

# NOTE(review): the checkpoint path is hard-coded (step 130 under ./best_shot),
# while the training cell manages ./model/best_shot; confirm the expected
# location before re-running.
ari = utils.eval_from_path(
    './best_shot/chkpts-130',
    dev_model,
    dev_session,
    dev_iterator,
    eval_steps,
    test,
)
print('score on the dev set = {}'.format(ari))

INFO:tensorflow:Restoring parameters from ./best_shot/chkpts-130


2021-02-08 19:53:44,830 : INFO : Restoring parameters from ./best_shot/chkpts-130


HBox(children=(FloatProgress(value=0.0, max=58.0), HTML(value='')))


score on the dev set = 0.15733293562616749
