In [15]:
import importlib
import os, sys
import dataloader as dd
from keras.optimizers import *
from keras.callbacks import *

# Re-execute dataloader.py so that edits made since the first import are picked up.
importlib.reload(dd)

# Token dictionaries for the source (itokens) and target (otokens) side.
itokens, otokens = dd.MakeS2SDict(
    'en2de.s2s.txt', dict_file='en2de_word.txt', min_freq=4)

# Index matrices for the training and validation files, in that order.
# Both use the same dictionaries and the same max_len.
(Xtrain, Ytrain), (Xvalid, Yvalid) = (
    dd.MakeS2SData(text_file, itokens, otokens, h5_file=cache_file, max_len=60)
    for text_file, cache_file in (
        ('en2de.s2s.txt', 'en2de.h5'),
        ('en2de.s2s.valid.txt', 'en2de.valid.h5'),
    )
)

# Quick sanity report: vocabulary sizes and array shapes.
print('seq 1 words:', itokens.num())
print('seq 2 words:', otokens.num())
print('train shapes:', Xtrain.shape, Ytrain.shape)
print('valid shapes:', Xvalid.shape, Yvalid.shape)

# Alternative RNN baseline, disabled. Kept as comments rather than as a no-op
# string literal so it cannot be mistaken for live code or a docstring.
# from rnn_s2s import RNNSeq2Seq
# s2s = RNNSeq2Seq(itokens,otokens, 256)
# s2s.compile('rmsprop')
# s2s.model.fit([Xtrain, Ytrain], None, batch_size=64, epochs=30, validation_data=([Xvalid, Yvalid], None))

# Explicit import: no visible line in this cell imports numpy by name, so `np`
# would otherwise only exist if a wildcard import or an earlier cell provided it.
import numpy as np

from transformer import Transformer, LRSchedulerPerStep, LRSchedulerPerEpoch

# Mini-batch size used to derive the number of optimizer steps per epoch.
# Keep in sync with the batch_size passed to s2s.model.fit().
BATCH_SIZE = 64
iters_per_epoch = int(np.ceil(Xtrain.shape[0] / BATCH_SIZE))

# Base learning rate handed to the optimizer (and read by LRSchedulerExponentialDecay).
l_rate = 0.001
class LRSchedulerExponentialDecay(Callback):
    """Keras callback that multiplies the learning rate by a constant factor once per epoch.

    The rate applied to a batch is ``base_lr * decay ** completed_epochs`` where
    ``completed_epochs`` is the number of full epochs worth of batches already seen.

    Args:
        d_model: Unused. Accepted so the callback can be constructed with the same
            positional arguments as ``LRSchedulerPerStep``.
        warmup: Unused, see ``d_model``.
        base_lr: Starting learning rate. ``None`` (default) reads the notebook
            global ``l_rate`` each time a batch starts.
        decay: Per-epoch multiplicative factor (default 0.99).
        steps_per_epoch: Batches per epoch. ``None`` (default) reads the notebook
            global ``iters_per_epoch`` each time a batch starts.
    """

    def __init__(self, d_model, warmup=4000, base_lr=None, decay=0.99,
                 steps_per_epoch=None):
        # Let the base class set up its own attributes before adding ours.
        super().__init__()
        self.base_lr = base_lr
        self.decay = decay
        self.steps_per_epoch = steps_per_epoch
        self.step_num = 0

    def on_batch_begin(self, batch, logs=None):
        """Set the optimizer learning rate for the batch that is about to run."""
        # Resolve the globals lazily so they may be (re)defined after construction.
        base_lr = l_rate if self.base_lr is None else self.base_lr
        steps = iters_per_epoch if self.steps_per_epoch is None else self.steps_per_epoch

        # Count completed epochs BEFORE counting the current batch; incrementing
        # first would apply the next epoch's rate to the last batch of each epoch.
        completed_epochs = self.step_num // steps
        self.step_num += 1

        lr = base_lr * (self.decay ** completed_epochs)
        K.set_value(self.model.optimizer.lr, lr)

# Alternative configuration (disabled):
# d_model = 256
# s2s = Transformer(itokens, otokens, len_limit=300, d_model=d_model, d_inner_hid=512, \
#        n_head=8, d_k=64, d_v=64, layers=6, dropout=0.1)

# Single source of truth for the checkpoint file: written by ModelCheckpoint and
# read back below to resume from earlier weights.
weights_path = 'en2de.model.h5'

d_model = 512
s2s = Transformer(itokens, otokens, len_limit=70, d_model=d_model, d_inner_hid=1024,
                  n_head=6, d_k=64, d_v=64, layers=4, dropout=0.1)

# Second argument is presumably the number of warm-up steps - confirm in transformer.py.
lr_scheduler = LRSchedulerPerStep(
     d_model, 4000)  # there is a warning that it is slow, however, it's ok.
# Alternative scheduler that only updates the learning rate once per epoch:
#lr_scheduler = LRSchedulerExponentialDecay(d_model, 4000)
model_saver = ModelCheckpoint(
    weights_path, save_best_only=True, save_weights_only=True)

s2s.compile(Adam(l_rate , 0.9, 0.999, epsilon=1e-8))
s2s.model.summary()

# Resume from an existing checkpoint when possible; otherwise start from scratch.
# Catch Exception (not a bare except) so Ctrl-C / kernel interrupts still
# propagate, and report the reason so a corrupt or mismatched checkpoint is not
# silently treated the same as a missing one.
try:
    s2s.model.load_weights(weights_path)
except Exception as err:
    print('\n\nnew model')
    print('(could not load {}: {!r})'.format(weights_path, err))

loading en2de_word.txt
loading en2de.h5
loading en2de.valid.h5
seq 1 words: 7943
seq 2 words: 2593
train shapes: (45582, 60) (45582, 60)
valid shapes: (4044, 60) (4044, 60)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
lambda_489 (Lambda)             (None, None)         0           input_11[0][0]                   
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
lambda_492 (Lambda)             (No

In [16]:
# Train for 15 epochs, validating on the held-out set after each one.
# The second positional argument (targets) is None for both the training and
# the validation data. The call is left as the cell's final expression so the
# returned History object is displayed.
s2s.model.fit(
    [Xtrain, Ytrain],
    None,
    batch_size=64,
    epochs=15,
    validation_data=([Xvalid, Yvalid], None),
    callbacks=[lr_scheduler, model_saver],
)

Train on 45582 samples, validate on 4044 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f50435da908>

In [3]:
import re
import random
import string
import warnings

import importlib

import customersupport.common
import customersupport.evaluation
import customersupport.evaluation.eval

print('Library versions:')

import tensorflow as tf
import pandas as pd
import numpy as np

# One "name:version" line per core library, in the order they were imported.
print('\n'.join(
    '{}:{}'.format(lib_name, lib.__version__)
    for lib_name, lib in (('tensorflow', tf), ('pandas', pd), ('numpy', np))
))

from IPython.display import SVG

from tqdm import tqdm_notebook as tqdm  # Special jupyter notebook progress bar

from tensorflow.python.layers import core as layers_core

from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch import helpers

from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset

from customersupport.evaluation.eval import *#evaluate_words_index, format_metrics, get_evaluation_conf, strip_punkt
import customersupport.common.utils

# Re-execute the project modules so that source edits are picked up without
# restarting the kernel.
importlib.reload(customersupport.common.vocab)
importlib.reload(customersupport.common.dataset)
importlib.reload(customersupport.evaluation)
importlib.reload(customersupport.evaluation.eval)
importlib.reload(customersupport.common.utils)

# importlib.reload() does not update names that were bound earlier with
# `from module import name`; those still refer to the pre-reload objects.
# Re-bind them from the freshly reloaded modules so the notebook really uses
# the new code.
from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset
from customersupport.evaluation.eval import *

# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

tqdm().pandas()  # Enable tracking of progress in dataframe `apply` calls

Library versions:
tensorflow:1.7.0
pandas:0.22.0
numpy:1.14.2





In [4]:
import os

# Vocabulary cap shared by the source and target side: 2**14 = 16384 entries.
# Larger values make network training slower.
MAX_VOCAB_SIZE = 2**14

# seq2seq generally relies on fixed length message vectors - longer messages provide more info
# but result in slower training and larger networks
#MAX_MESSAGE_LEN = 50

# Root directory of the external data files. Override with the CS_DATA_DIR
# environment variable; the default reproduces the original absolute paths.
DATA_DIR = os.environ.get('CS_DATA_DIR', '/home/momchil/Storage/Projects/Python/Data')

# The same metric names are passed as `evaluation_metrics` and `training_metrics`.
METRICS = [
    "bleu", "rouge_l", "embedding_average", "vector_extrema",
    "greedy_matching"
]

hparams = tf.contrib.training.HParams(
    # Larger batch sizes generally reach the average response faster, but small batch sizes are
    # required for the model to learn nuanced responses.  Also, GPU memory limits max batch size.
    batch_size=128,
    encoder_length=60,
    decoder_length=60,
    # Embedding size for whole messages, same trade off as word embeddings
    num_units=256,
    src_vocab_size=MAX_VOCAB_SIZE,
    # Embedding size for words - gives a trade off between expressivity of words and network size
    embedding_size=200,
    tgt_vocab_size=MAX_VOCAB_SIZE,
    # High learning rate helps model reach average response faster, but can make it hard to
    # converge on nuanced responses
    learning_rate=1e-03,  #0.0005,
    max_gradient_norm=5.0,
    beam_width=10,
    use_attention=True,
    enc_num_layers=2,
    dec_num_layers=2,
    cell_type='bi',
    rnn_type='gru',
    attention_architecture='gnmt',
    max_epochs=30,
    # Helps regularize network and prevent overfitting.
    dropout=0.2,
    use_glove=True,
    l2_reg=0.,
    glove_path=os.path.join(
        DATA_DIR, 'glove.twitter.27B', 'glove.twitter.27B.200d.txt'),
    tweets_path=os.path.join(
        DATA_DIR, 'customer-support-on-twitter', 'twcs-conv_ids_clean.csv'),
    # Ngram count for ROUGE and BLEU
    max_order=2,
    train_size=0.8,
    decay_rate=0.99,
    train_time_diff=5.0,
    first_day=0,
    last_day=60,
    # Separate list copies so the two hyperparameters never share one object.
    evaluation_metrics=list(METRICS),
    training_metrics=list(METRICS),
    companies=['AppleSupport'])

Instructions for updating:
Use the retry module or similar alternatives.


In [5]:
%%time
# Cell magic above reports CPU and wall time for the whole cell; the saved run
# took roughly three minutes.

# Construct the dataset object from the hyperparameters defined earlier.
cs_data = CustomerSupportDataset(hparams)

# Leftover fragment of a pandas filter expression; it is a comment and never runs.
#& (y_text.str.contains('help') ^ True)
# NOTE(review): what the ['direct message'] argument means is defined in
# CustomerSupportDataset.process_utterances - presumably phrases used to filter
# utterances; confirm against that method.
cs_data.process_utterances(['direct message'])

Done support_author (984679, 9)
Replacing anonymized screen names in X...



Replacing anonymized screen names in Y...



CPU times: user 3min 8s, sys: 923 ms, total: 3min 9s
Wall time: 3min 7s


In [6]:
# Fit the vocabulary on both sides of the conversation, capped at src_vocab_size.
voc_holder = VocabHolder(hparams)
analyzer = voc_holder.fit(cs_data.x_text, cs_data.y_text, hparams.src_vocab_size)

# Vectorise the text and split it without shuffling (do_random=False).
cs_data.text_to_vec(hparams, voc_holder)
cs_data.train_test_split(hparams, do_random=False)


def _paired_text(idx):
    """Return (x_text, y_text) at positions ``idx`` with missing pairs removed.

    A pair is dropped when EITHER side is NaN. Dropping NaNs from each series
    independently could remove different rows from x and y, after which the
    i-th question would no longer belong to the i-th reference; the evaluation
    cell pairs them by position. When nothing is missing the result is the same
    as calling ``.dropna()`` on each series.
    """
    positions = list(idx)
    x_rows = cs_data.x_text.iloc[positions]
    y_rows = cs_data.y_text.iloc[positions]
    # Positional boolean mask (plain ndarray) so index labels play no role.
    keep = x_rows.notnull().values & y_rows.notnull().values
    return x_rows[keep], y_rows[keep]


train_x, train_y = _paired_text(cs_data.train_idx)
test_x, test_y = _paired_text(cs_data.test_idx)

Loaded glove
Loaded w2v
Fitting CountVectorizer on X and Y text data...



Number of known words 13794
Learned vocab of 16384 items.
Calculating word indexes for X...



Calculating word indexes for Y...



Training data of shape (45582, 60) and test data of shape (4044, 60).
count    45582.000000
mean         1.000000
std          0.141677
min          0.740038
25%          0.883758
50%          1.021893
75%          1.097074
max          1.286219
dtype: float64
count    4044.000000
mean        1.000000
std         0.014701
min         0.972407
25%         0.988713
50%         1.001299
75%         1.011627
max         1.022508
dtype: float64


In [25]:
def _as_object_array(sequences):
    """Pack index arrays of differing lengths into a 1-D object array.

    ``np.array(list_of_arrays)`` yields a 2-D numeric array when all lengths
    happen to match and an object array otherwise, and recent NumPy versions
    refuse the ragged case unless dtype=object is given. Filling a
    pre-allocated object array element by element always gives one element per
    sequence.
    """
    packed = np.empty(len(sequences), dtype=object)
    for pos, seq in enumerate(sequences):
        packed[pos] = seq
    return packed


references = []
hypothesis = []
failed_examples = []  # positions in test_x for which beam search raised

for i in tqdm(range(len(test_x))):
    ref = test_y.iloc[i]
    question = customersupport.common.utils.tweet_tokenize(test_x.iloc[i])
    #voc_holder.from_word_idx(voc_holder.to_word_idx(, -1))[:297]
    # NOTE(review): the saved output shows GPU OOM errors for attention tensors
    # with a sequence dimension above 7000 although the model was built with
    # len_limit=70, so some questions look far too long. Consider capping the
    # question length here - confirm first what tweet_tokenize returns.
    try:
        # Beam width 3; keep the text of the first returned candidate.
        a_text = s2s.beam_search(question, 3, delimiter=' ')[0][0]
    except Exception as e:
        # Best effort: record which example failed and score it as an empty hypothesis.
        print('beam_search failed for test example {}:'.format(i))
        print(e)
        failed_examples.append(i)
        a_text = ''

    #references.append(strip_punkt(voc_holder.to_word_idx(ref, -1), eval_conf.voc_holder.reverse_vocab))
    #hypothesis.append(strip_punkt(voc_holder.to_word_idx(a_text, -1), eval_conf.voc_holder.reverse_vocab))

    r = voc_holder.to_word_idx(ref, -1)
    h = voc_holder.to_word_idx(a_text, -1)
    # Keep only the non-zero indices (zero is presumably the padding id - confirm).
    references.append(r[r.nonzero()])
    hypothesis.append(h[h.nonzero()])

if failed_examples:
    print('{} of {} examples failed to decode: {}'.format(
        len(failed_examples), len(test_x), failed_examples))

references = _as_object_array(references)
hypothesis = _as_object_array(hypothesis)

OOM when allocating tensor with shape[18,7098,7098] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: lambda_604/MatMul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lambda_600/Reshape_1, lambda_601/Reshape_1)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: layer_normalization_78_1/add_1/_6913 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1298_layer_normalization_78_1/add_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused 

OOM when allocating tensor with shape[18,7235,7235] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: lambda_604/MatMul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lambda_600/Reshape_1, lambda_601/Reshape_1)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: layer_normalization_78_1/add_1/_6913 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1298_layer_normalization_78_1/add_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused 




In [26]:
# Wrap both result collections as NumPy arrays. In [25] ends with the same
# conversion, so this only changes anything when that cell stopped before its
# last two lines.
references, hypothesis = np.array(references), np.array(hypothesis)

In [35]:
# List the positions whose hypothesis has no tokens at all, one per line.
empty_rows = [pos for pos, hyp in enumerate(hypothesis) if not len(hyp)]
for pos in empty_rows:
    print(pos)

2898
2901
2959
2962


In [49]:
# Look at the four entries that In [35] reported as empty; the indices are
# copied by hand from that cell's output.
# NOTE(review): the saved output shows each entry as [0] rather than empty, so
# they appear to have been patched by a cell that is not part of this notebook
# (execution counts jump from 35 to 49) - confirm, and add that step back if the
# evaluation below depends on it.
hypothesis[[2898, 2901, 2959, 2962]]

array([list([0]), list([0]), list([0]), list([0])], dtype=object)

In [50]:
# Evaluation configuration built from the hyperparameters and the fitted
# vocabulary; the remaining positional slots are left as None.
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)

# Score the decoded hypotheses against the references on every metric listed
# in hparams.evaluation_metrics.
evaluation = evaluate_words_index(
    references,
    hypothesis,
    eval_conf,
    hparams.evaluation_metrics,
    True,
)

metrics_report = format_metrics(evaluation)
print(metrics_report)

BLEU@2: 12.517067570165455
Embedding Average: 76.93455490984044
Greedy Matching: 30.129827156763795
ROUGE_L: 22.763979193134194
Vector Extrema: 37.125948048069475


In [32]:
import os

# Turn each hypothesis index array back into text with the fitted vocabulary.
hypothesis_text = list(map(voc_holder.from_word_idx, hypothesis))

# One row per test example: question, reference answer, model answer.
df = pd.DataFrame(
    {'Question': test_x, 'Reference': test_y, 'Hypothesis': hypothesis_text},
    columns=["Question", "Reference", "Hypothesis"])

# Output location. Override with the CS_RESULTS_TSV environment variable; the
# default keeps the original absolute path.
results_path = os.environ.get(
    'CS_RESULTS_TSV', '/home/momchil/Desktop/transformer_all_dict.tsv')
df.to_csv(results_path, sep='\t', encoding='utf-8')