# Imports

In [1]:
! pip install -q maru razdel simple-elmo deeppavlov transformers sentence-transformers

In [None]:
! pip install --upgrade transformers

In [1]:
# Select the TensorFlow 1.x runtime; later cells use TF1-only APIs such as tf.Session.
# NOTE(review): this magic appears to be Google Colab specific — confirm before
# running the notebook in another environment.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [50]:
import pandas as pd
import io
import gzip
import pathlib
import urllib.request
import zipfile
import os
import numpy as np
import maru
import razdel
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_set = set(stopwords.words('russian'))
import pickle
from scipy.stats import randint, uniform
import zipfile
from math import ceil
from tqdm import tqdm_notebook as tqdm
from os.path import join
import pickle as p
from collections import namedtuple


import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.cluster import (KMeans, AffinityPropagation, AgglomerativeClustering,
                            Birch, SpectralClustering)
from sklearn.model_selection import ParameterSampler
from sklearn.decomposition import PCA


from evaluate import evaluate

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data load

In [125]:
from preprocessor import Preprocessor, Config
from loader import TrainLoader, DevLoader

# Matches any single character that is NOT covered by the class below:
# the letter ranges `а-я` / `А-я`, the letter `ё`, digits, the punctuation
# marks . , ? ! : ; and the space character.
# NOTE(review): presumably the Preprocessor uses this to strip unwanted
# characters — confirm in preprocessor.py.
pattern = re.compile(r'[^а-яА-я0-9.,?!:; ё]')

# maru analyzer configured with its 'linear' tagger.
analyzer = maru.get_analyzer(tagger='linear')

# Preprocessing options: lemmatisation on, POS tags and stop-word removal off.
config = Config(
    regexp=pattern,
    stopwords=stopwords_set,
    analyzer=analyzer,
    with_pos_tag=False,
    remove_stop_words=False,
    lemmatize=True,
    tokenizer=razdel,
)
preprocessor = Preprocessor(config)

# Three independent PCA objects (300 components each), one per input kind:
# sequences, queries and contexts.
pca_seq, pca_query, pca_cont = (PCA(n_components=300) for _ in range(3))

In [65]:
# Raw train / test tables. Both files are read as tab-separated,
# regardless of their differing extensions.
train, test = (
    pd.read_csv(table_path, sep='\t')
    for table_path in ('./data/train.tsv', './data/test.csv')
)


In [66]:
# Training loader built from the shared preprocessor and the three PCA objects.
# NOTE(review): load_dataset presumably embeds the texts with the ELMo model
# stored at the second path — confirm in loader.py.
train_loader = TrainLoader(preprocessor, pca_seq, pca_query, pca_cont)
(train_q,
 train_c,
 train_s,
 train_lens,
 train_target) = train_loader.load_dataset('./data/train.tsv',
                                           './pretrained_models/elmo')

In [67]:
# The dev loader receives the PCA objects held by the train loader, so the
# dev data goes through the same PCA instances as the training data.
dev_loader = DevLoader(preprocessor,
                       train_loader.pca_seq,
                       train_loader.pca_query,
                       train_loader.pca_cont)
(dev_q,
 dev_c,
 dev_s,
 dev_lens,
 dev_target) = dev_loader.load_dataset('./data/test.csv',
                                       './pretrained_models/elmo')

In [124]:
# Persist the PCA objects held by the train loader so they can be reloaded
# later without re-running the training data pipeline.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (the bare `open(...)` form left it dangling).
for pca_path, pca_obj in (('./data/pca_seq.p', train_loader.pca_seq),
                          ('./data/pca_cont.p', train_loader.pca_cont),
                          ('./data/pca_query.p', train_loader.pca_query)):
    with open(pca_path, 'wb') as pca_file:
        p.dump(pca_obj, pca_file)

# Fit

In [80]:
# Pickle every train / dev array to ./data so the input pipeline can read
# them back from disk (the same paths are handed to FileObj further down).
# Each file is written inside a `with` block so it is flushed and closed
# deterministically; the bare `open(...)` form never closed the handles.
for dump_path, dump_obj in (('./data/train_seq.p', train_s),
                            ('./data/test_seq.p', dev_s),
                            ('./data/train_queries.p', train_q),
                            ('./data/train_contexts.p', train_c),
                            ('./data/test_queries.p', dev_q),
                            ('./data/test_contexts.p', dev_c),
                            ('./data/train_target.p', train_target),
                            ('./data/test_target.p', dev_target),
                            ('./data/train_lens.p', train_lens),
                            ('./data/test_lens.p', dev_lens)):
    with open(dump_path, 'wb') as dump_file:
        pickle.dump(dump_obj, dump_file)

In [102]:
from input_pipeline import get_iterator, FileObj
from model import Model
from utils import count_num_steps, deval, clear_dir, copy_files

In [103]:
class HParams(namedtuple('hparams', (
        'num_units',
        'num_layers',
        'num_classes',
        'dropout_rate',
        'learning_rate',
        'batch_size',
        'num_epochs',
        'buffer_size',
        'num_steps_to_eval',
        'chkpts_dir',
))):
    """Immutable bundle of the hyper-parameters used by this notebook.

    Fields are plain named-tuple attributes; the instance below is passed to
    the model constructor and read when building the input iterators and the
    training loop.
    """


# Values used for the run recorded in this notebook.
hparams = HParams(
    num_units=128,
    num_layers=3,
    num_classes=8,
    dropout_rate=0.3,
    learning_rate=3e-04,
    batch_size=64,
    num_epochs=20,
    buffer_size=512,
    num_steps_to_eval=10,
    chkpts_dir='./model/chkpts',
)

In [104]:
# Training and evaluation each get their own tf.Graph and tf.Session, so the
# two models are built and run independently of each other.
train_graph = tf.Graph()
dev_graph = tf.Graph()

train_session = tf.Session(graph=train_graph)
dev_session = tf.Session(graph=dev_graph)

# Pickled inputs on disk; judging by the file names the argument order is
# sequences, queries, contexts, lengths, targets.
train_files = FileObj(
    './data/train_seq.p',
    './data/train_queries.p',
    './data/train_contexts.p',
    './data/train_lens.p',
    './data/train_target.p',
)
dev_files = FileObj(
    './data/test_seq.p',
    './data/test_queries.p',
    './data/test_contexts.p',
    './data/test_lens.p',
    './data/test_target.p',
)

# --- training graph -------------------------------------------------------
with train_graph.as_default():
    train_iterator, train_total_num = get_iterator(
        regime='TRAIN',
        buffer_size=hparams.buffer_size,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size,
        filesobj=train_files,
    )
    train_model = Model(hparams, train_iterator, 'TRAIN')

    # Created after the model is built, so it covers every variable that
    # exists in this graph at this point.
    train_vars_init_op = tf.global_variables_initializer()

    # Freeze the graph: no further ops may be added from here on.
    train_graph.finalize()

# --- evaluation graph -----------------------------------------------------
with dev_graph.as_default():
    # A single pass over the dev data, with no shuffle buffer size given.
    dev_iterator, dev_total_num = get_iterator(
        regime='DEV',
        buffer_size=None,
        num_epochs=1,
        batch_size=hparams.batch_size,
        filesobj=dev_files,
    )
    # No variable initializer is created for this graph.
    # NOTE(review): presumably the weights are restored from a checkpoint
    # during evaluation — confirm in utils.deval.
    dev_model = Model(hparams, dev_iterator, 'DEV')

    dev_graph.finalize()


In [105]:
# Remove the ./model directory (hparams.chkpts_dir points to ./model/chkpts)
# and the ./logs directory left over from a previous run.
# On a fresh environment neither directory exists yet, so `rm` prints
# "No such file or directory"; that message is harmless here.
! rm -r './model'
! rm -r './logs'

rm: cannot remove './model': No such file or directory
rm: cannot remove './logs': No such file or directory


In [None]:
# Number of steps for the whole training run and for one pass over the dev set.
train_steps = count_num_steps(hparams.num_epochs,
                              train_total_num,
                              hparams.batch_size)

eval_steps = count_num_steps(1,
                             dev_total_num,
                             hparams.batch_size)

# Evaluate on the dev set every `eval_count` training steps.
eval_count = hparams.num_steps_to_eval

# Prepare the training session: input iterator first, then the variables.
train_session.run(train_iterator.initializer)
train_session.run(train_vars_init_op)

# ARI obtained at every evaluation, in chronological order.
aris = []

with tqdm(total = train_steps) as prog:
    for step in range(train_steps):
        _, res = train_model.train(train_session)

        # Report the training loss every 20 steps.
        if step % 20 == 0:
            print('train loss at step {} = {}'.format(step, res.train_loss))

        if step % eval_count == 0:
            ari, current_path = deval(
                train_model,
                train_session,
                hparams.chkpts_dir,
                step,
                dev_model,
                dev_session,
                dev_iterator,
                eval_steps,
                test)
            aris.append(ari)

            # `ari` was just appended, so this holds exactly when the current
            # evaluation is the best one so far (ties included).
            if ari >= max(aris):
                # Make sure the target directory exists before touching it.
                # Unlike a one-off `mkdir` at step 0, this also creates the
                # missing parent directory and does not fail when the folder
                # is already there (e.g. when the cell is re-run).
                os.makedirs('./model/best_shot', exist_ok=True)

                clear_dir('./model/best_shot')
                copy_files(current_path)

                # Checkpoint path of the best evaluation seen so far.
                best_shot_path = current_path

            print('eval ari at step {} = {}'.format(step, ari))
        prog.update(1)

# Evaluate with prepared data

In [121]:
from script import run

In [122]:
# Call the `run` entry point imported from script.py. In the recorded output
# it restores parameters from a checkpoint and prints the score on the dev set.
run()

INFO:tensorflow:Restoring parameters from ./best_shot/chkpts-130


2021-02-03 16:15:10,262 : INFO : Restoring parameters from ./best_shot/chkpts-130


HBox(children=(FloatProgress(value=0.0, max=58.0), HTML(value='')))


score on the dev set = 0.15733293562616749


# Evaluate from scratch

In [126]:
# Reload the PCA objects that were pickled to ./data earlier in the notebook.
# `with` guarantees every file handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open('./data/pca_seq.p', 'rb') as pca_file:
    pca_seq = p.load(pca_file)
with open('./data/pca_cont.p', 'rb') as pca_file:
    pca_cont = p.load(pca_file)
with open('./data/pca_query.p', 'rb') as pca_file:
    pca_query = p.load(pca_file)

In [None]:
# Rebuild the dev loader from the PCA objects reloaded from disk and run the
# dev file through it again.
dev_loader = DevLoader(preprocessor, pca_seq, pca_query, pca_cont)
(dev_q,
 dev_c,
 dev_s,
 dev_lens,
 dev_target) = dev_loader.load_dataset('./data/test.csv',
                                       './pretrained_models/elmo')

In [129]:
! mkdir ./data1

In [130]:
# Pickle the freshly computed dev arrays into ./data1.
# Each file is written inside a `with` block so it is flushed and closed
# deterministically; the bare `open(...)` form never closed the handles.
for dump_path, dump_obj in (('./data1/test_seq.p', dev_s),
                            ('./data1/test_queries.p', dev_q),
                            ('./data1/test_contexts.p', dev_c),
                            ('./data1/test_target.p', dev_target),
                            ('./data1/test_lens.p', dev_lens)):
    with open(dump_path, 'wb') as dump_file:
        pickle.dump(dump_obj, dump_file)

In [132]:
# Point the dev input pipeline at the arrays written to ./data1; judging by
# the file names the order is sequences, queries, contexts, lengths, targets.
dev_files = FileObj(
    './data1/test_seq.p',
    './data1/test_queries.p',
    './data1/test_contexts.p',
    './data1/test_lens.p',
    './data1/test_target.p',
)

In [135]:
from input_pipeline import get_iterator, FileObj
from model import Model
from utils import count_num_steps, deval, clear_dir, copy_files, eval_from_path

In [136]:
# Fresh evaluation graph built on top of the ./data1 files.
dev_graph = tf.Graph()

with dev_graph.as_default():
    # A single pass over the dev data.
    dev_iterator, dev_total_num = get_iterator(
        regime='DEV',
        buffer_size=None,
        num_epochs=1,
        batch_size=hparams.batch_size,
        filesobj=dev_files,
    )
    dev_model = Model(hparams, dev_iterator, 'DEV')

    # Freeze the graph: no further ops may be added from here on.
    dev_graph.finalize()

dev_session = tf.Session(graph=dev_graph)

# Number of batches needed for one pass over the dev set.
eval_steps = count_num_steps(1, dev_total_num, hparams.batch_size)

# Score the checkpoint stored at the given path against the dev table.
ari = eval_from_path(
    './best_shot/chkpts-130',
    dev_model,
    dev_session,
    dev_iterator,
    eval_steps,
    test,
)
print('score on the dev set = {}'.format(ari))

INFO:tensorflow:Restoring parameters from ./best_shot/chkpts-130


2021-02-03 16:23:48,009 : INFO : Restoring parameters from ./best_shot/chkpts-130


HBox(children=(FloatProgress(value=0.0, max=58.0), HTML(value='')))


score on the dev set = 0.15733293562616749


In [137]:
# Recursively copy the ./data directory (pickled arrays and PCA objects) into
# ./gdrive/MyDrive/russe/.
# NOTE(review): ./gdrive is presumably a mounted Google Drive — the mount step
# is not shown in this notebook; confirm it exists before running.
! cp -R ./data ./gdrive/MyDrive/russe/