In [1]:
import re
import random
import string
import warnings

import importlib

import customersupport.common
import customersupport.evaluation
import customersupport.evaluation.eval

print('Library versions:')

import tensorflow as tf
print('tensorflow:{}'.format(tf.__version__))
import pandas as pd
print('pandas:{}'.format(pd.__version__))
import numpy as np
print('numpy:{}'.format(np.__version__))

from IPython.display import SVG

from tqdm import tqdm_notebook as tqdm  # Special jupyter notebook progress bar

from tensorflow.python.layers import core as layers_core

from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Reload the project modules BEFORE binding names from them.  `from m import X`
# copies the object that exists at import time, so reloading `m` afterwards
# leaves `X` pointing at the stale, pre-reload definition.
# The explicit submodule imports make sure the `customersupport.common.vocab`
# and `customersupport.common.dataset` attributes exist on a fresh kernel.
import customersupport.common.vocab
import customersupport.common.dataset

importlib.reload(customersupport.common.vocab)
importlib.reload(customersupport.common.dataset)
importlib.reload(customersupport.evaluation)
importlib.reload(customersupport.evaluation.eval)

from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset

from customersupport.evaluation.eval import evaluate_words_index, format_metrics, get_evaluation_conf, strip_punkt

# Silences every warning category for the rest of the session.
warnings.simplefilter('ignore')

tqdm().pandas()  # Enable tracking of progress in dataframe `apply` calls

  from ._conv import register_converters as _register_converters


Library versions:
tensorflow:1.7.0
pandas:0.22.0
numpy:1.14.2





In [2]:
# 2**14 = 16384 words - large enough for demonstration, larger values make network training slower
MAX_VOCAB_SIZE = 2**14
# seq2seq generally relies on fixed length message vectors - longer messages provide more info
# but result in slower training and larger networks
MAX_MESSAGE_LEN = 70

# All tunable settings for this notebook live in one HParams object, which is
# handed to the dataset, vocabulary and evaluation helpers further down.
hparams = tf.contrib.training.HParams(
    # Larger batch sizes generally reach the average response faster, but small batch sizes are
    # required for the model to learn nuanced responses.  Also, GPU memory limits max batch size.
    batch_size=128,
    encoder_length=MAX_MESSAGE_LEN,
    decoder_length=MAX_MESSAGE_LEN,
    # Embedding size for whole messages, same trade off as word embeddings
    num_units=512,
    src_vocab_size=MAX_VOCAB_SIZE,
    # Embedding size for words - gives a trade off between expressivity of words and network size
    embedding_size=200,
    tgt_vocab_size=MAX_VOCAB_SIZE,
    # High learning rate helps model reach average response faster, but can make it hard to
    # converge on nuanced responses
    learning_rate = 1e-04,  # alternative value left by the author: 0.0005
    max_gradient_norm = 5.0,
    beam_width = 10,
    use_attention = True,
    enc_num_layers = 2,
    dec_num_layers = 2,
    cell_type = 'bi',
    rnn_type = 'gru',
    max_epochs = 15,
    # Helps regularize network and prevent overfitting.
    # NOTE(review): this remark originally sat above `learning_rate`; it reads as a
    # description of dropout - confirm.
    dropout = 0.2,
    use_glove = True,
    l2_reg = 0.,
    decay_rate = .9,
    # NOTE(review): absolute, machine-specific paths - adjust (or derive from a
    # configurable data directory) before running on another machine.
    glove_path = '/home/momchil/Storage/Projects/Python/Data/glove.twitter.27B/glove.twitter.27B.200d.txt',
    tweets_path = '/home/momchil/Storage/Projects/Python/Data/customer-support-on-twitter/twcs-conv_ids_clean.csv',
    # Ngram count for ROUGE and BLEU
    max_order = 2,
    train_size = 0.8,
    train_time_diff = 5.,
    first_day = 0,  # alternative values left by the author: 5, 15
    last_day = 60,  # alternative values left by the author: 33, 23
    evaluation_metrics = ["bleu", "rouge_l", "embedding_average", "vector_extrema", "greedy_matching"],
    training_metrics = ["bleu", "rouge_l", "embedding_average", "vector_extrema", "greedy_matching"],
    companies = ['AppleSupport']
)

Instructions for updating:
Use the retry module or similar alternatives.


In [3]:
%%time
# Build the dataset wrapper from the hyper-parameters.
# NOTE(review): presumably this reads hparams.tweets_path and filters by
# hparams.companies / first_day / last_day - confirm in CustomerSupportDataset.
cs_data = CustomerSupportDataset(hparams)

# Leftover filter experiment, kept for reference:
#& (y_text.str.contains('help') ^ True)
# NOTE(review): how the phrase list is used is decided inside process_utterances;
# presumably replies containing 'direct message' are excluded - confirm.
cs_data.process_utterances(['direct message'])

Done support_author (984679, 9)
Replacing anonymized screen names in X...



Replacing anonymized screen names in Y...



CPU times: user 3min 3s, sys: 1.98 s, total: 3min 5s
Wall time: 3min 11s


In [4]:
# Fit the shared vocabulary on both the question (x) and answer (y) texts,
# with hparams.src_vocab_size passed as the third argument.
# `analyzer` is whatever `fit` returns; it is not used by the cells visible below.
voc_holder = VocabHolder(hparams)
analyzer = voc_holder.fit(cs_data.x_text, cs_data.y_text, hparams.src_vocab_size)

Fitting CountVectorizer on X and Y text data...



Number of known words 13796
Learned vocab of 16384 items.


In [21]:
# Sanity check: for every vocabulary index from `unk_count` up to the end of the
# vocabulary, the vector returned by `get_glove_weight(i)` must equal the row of
# `glove_words` looked up by the word stored at `reverse_vocab[i]`.
# The recorded run of this cell was interrupted (KeyboardInterrupt), so the check
# is not known to have completed.
#
# Manual spot checks kept for reference:
#voc_holder.glove_weights['información']
#voc_holder.vocab['información']

#voc_holder.glove_weights[13895 - voc_holder.unk_count]
for i in tqdm(range(voc_holder.unk_count, len(voc_holder.vocab))):
    #voc_holder.vocab['información']
    assert np.array_equal(voc_holder.get_glove_weight(i), voc_holder.glove_words.loc[voc_holder.reverse_vocab[i]].values)

KeyboardInterrupt: 

In [7]:
#información -0.70397 0.18217 -0.064088 -0.0056588 0.51003 -0.10394 -0.42054 -0.34863 -0.17556 0.14012 0.80545 -0.084485 0.93872 0.029406 0.65709 -0.062524 0.049807 -0.20443 1.066 0.41751 -0.21147 -0.93464 0.10176 -0.0032734 0.16059 -2.5545 0.45048 -0.31093 -0.047943 0.19738 -0.37638 -0.34054 -0.40261 -0.22547 0.22389 0.15547 -0.48353 0.21042 -0.89683 -0.16658 -0.54625 0.084119 0.29465 0.53094 0.28825 0.24623 0.52789 0.1222 -0.78786 0.10983 0.18148 -0.17835 -0.033985 0.077592 -0.88948 0.2255 -0.035032 -0.34115 0.0091689 0.24191 0.15341 -0.084803 0.5622 0.96704 0.77437 0.84728 0.068289 -0.028839 0.50977 -0.22183 -0.25135 -0.32503 0.86078 -0.95383 -0.45049 -0.058383 0.3746 -0.54584 -0.48925 -0.28214 -0.3099 0.25639 0.46344 -0.53287 0.40488 -0.10087 -0.74409 -0.40578 0.28525 0.30872 -0.42433 0.38774 -0.13566 0.11796 0.081923 -0.2832 -0.0072533 0.19522 -0.098084 -0.5526 -0.0634 0.27517 -0.4435 -0.0083655 0.24503 -0.67493 -1.4786 -0.18219 0.22464 -0.36414 0.19137 0.080971 -0.079933 0.22319 0.97154 0.94297 -0.52906 0.96633 0.35895 0.33781 -0.46276 0.79585 0.69646 0.031073 0.39987 -0.21439 0.055769 0.49177 0.47638 -0.21456 0.42292 -0.12287 -0.012149 0.81339 0.41095 -0.44385 0.08316 -0.20046 0.66937 0.38491 0.13405 0.21944 0.17057 -0.42101 0.24932 -0.52619 0.22774 -0.47653 -1.0329 0.016041 -0.74861 -0.35379 -0.80166 0.27047 -0.69027 -0.063318 0.85249 -0.20612 -0.74476 0.031826 -0.017643 -0.22145 -0.028813 0.79223 0.63236 0.3902 -0.86664 0.29952 -0.23364 -0.48895 0.51829 0.47168 -0.26713 -0.35187 -0.42353 -0.016235 0.38586 0.25133 -0.37427 0.3312 0.39023 -0.69639 0.36957 0.0059467 -0.32109 0.6286 -0.67398 -0.1552 0.085883 0.33042 0.42234 0.17393 -0.46704 -0.18503 1.0627 0.41987 -0.36343 -0.11899 -0.45553 0.6923
# v.glove_weights[v.vocab['información']]

In [8]:
cs_data.text_to_vec(hparams, voc_holder)
cs_data.train_test_split(hparams, do_random=False)


def _paired_text(row_positions):
    """Return the (x, y) texts at `row_positions`, keeping only complete pairs.

    The question and answer series are filtered with ONE joint mask.  Dropping
    missing values from each series independently would shift every later
    pair out of alignment as soon as a single row had a value missing on only
    one side, and the series are later consumed positionally (zip / iloc[i]).

    Assumes `cs_data.x_text` and `cs_data.y_text` are 1-D pandas Series of
    equal length whose rows correspond positionally.
    """
    rows = list(row_positions)
    x_part = cs_data.x_text.iloc[rows]
    y_part = cs_data.y_text.iloc[rows]
    # `.values` makes the mask positional, so differing index labels cannot
    # trigger label alignment between the two series.
    complete = x_part.notna().values & y_part.notna().values
    return x_part[complete], y_part[complete]


train_x, train_y = _paired_text(cs_data.train_idx)
test_x, test_y = _paired_text(cs_data.test_idx)

Calculating word indexes for X...



Calculating word indexes for Y...



Training data of shape (45582, 70) and test data of shape (4044, 70).
count    45582.000000
mean         1.000000
std          1.318587
min          0.019216
25%          0.123510
50%          0.566117
75%          1.191551
max          6.313764
dtype: float64
count    4044.000000
mean        1.000000
std         0.151071
min         0.737908
25%         0.878444
50%         1.003007
75%         1.116925
max         1.249495
dtype: float64


In [9]:
def create_doc(text_x, text_y, context, author):
    """Assemble the document stored in the index for one question/answer pair.

    Args:
        text_x: the customer message (question side).
        text_y: the support reply (answer side).
        context: free-form context string stored alongside the pair.
        author: name stored in the ``author`` field.

    Returns:
        dict with the keys ``author``, ``text_x``, ``text_y``, ``context`` and
        ``timestamp`` (the local time at which the document was built).
    """
    return {
        'author': author,
        'text_x': text_x,
        'text_y': text_y,
        'context': context,
        'timestamp': datetime.now(),
    }


def create_index(es, index_name='test-index'):
    """(Re)create the tweet index with an explicit mapping.

    Any existing index of that name is deleted first, then a fresh one is
    created whose ``text_x``, ``text_y`` and ``context`` fields are full-text
    fields with an additional ``.english`` sub-field analysed by the built-in
    ``english`` analyzer.  ``author`` is a plain text field.

    The mapping is built as a Python dict rather than a hand-written JSON
    string.  The previous JSON literal had unbalanced braces and placed a
    ``tokenizer`` object inside the field mapping, which is not a field
    mapping parameter.  The create request was therefore rejected with HTTP
    400, and because 400 was ignored the index silently fell back to dynamic
    mapping.  Errors on create are no longer ignored, so a rejected mapping
    is reported immediately.

    To use an n-gram tokenizer, declare a custom analyzer under
    ``settings.analysis`` in the create body and reference it by name from
    the field.

    Args:
        es: Elasticsearch client (anything exposing ``indices.delete`` and
            ``indices.create``).
        index_name: name of the index to rebuild.

    Raises:
        Whatever the client raises when the create request is rejected.
    """
    def _analysed_text():
        # Fresh dict per field so the three fields never share state.
        return {
            "type": "text",
            "fields": {
                "english": {
                    "type": "text",
                    "analyzer": "english",
                }
            },
        }

    mapping = {
        "mappings": {
            "tweet": {
                "properties": {
                    "text_x": _analysed_text(),
                    "text_y": _analysed_text(),
                    "context": _analysed_text(),
                    "author": {"type": "text"},
                }
            }
        }
    }

    # A missing index (404) or bad request (400) on delete is fine: there is
    # simply nothing to remove.
    es.indices.delete(index=index_name, ignore=[400, 404])
    es.indices.create(index=index_name, body=mapping)


def fill_index(es, text_x, text_y, batch=10000, author='AppleSupport', index_name='test-index'):
    """Bulk-index question/answer pairs into Elasticsearch.

    Pairs are taken positionally from ``text_x`` and ``text_y`` (iteration
    stops at the shorter of the two) and sent in chunks of ``batch``
    documents.  Document ids are assigned by Elasticsearch, so calling this
    twice on the same data stores every pair twice.

    Args:
        es: Elasticsearch client handed to ``helpers.bulk``.
        text_x: iterable of customer messages.
        text_y: iterable of support replies, aligned with ``text_x``.
        batch: number of documents per bulk request.
        author: value stored in each document's ``author`` field.
        index_name: target index.
    """
    actions = []
    for tx, ty in zip(text_x, text_y):
        actions.append({
            "_index": index_name,
            "_type": "tweet",
            "_source": create_doc(tx, ty, '', author),
        })
        if len(actions) == batch:
            helpers.bulk(es, actions)
            # Report only after the request has succeeded.
            print("Pushed {} rows".format(len(actions)))
            actions = []

    # Flush the final, partially filled chunk.
    if actions:
        helpers.bulk(es, actions)


def query_es(text_x, num_hits = 1, is_array = True, query_field = 'text_x'):
    """Retrieve the stored pairs that best match one query text.

    Uses the notebook-level ``es`` client and searches ``test-index``.

    Args:
        text_x: the query; raw text when ``is_array`` is False, otherwise a
            word-index representation that is first converted with
            ``from_word_idx``.
        num_hits: maximum number of hits to return.
        is_array: whether ``text_x`` must be converted to text first.
        query_field: document field the query text is matched against.

    Returns:
        list of ``(text_x, text_y)`` tuples taken from the hits, in the order
        Elasticsearch returned them (empty when nothing matched).
    """
    if is_array:
        # NOTE(review): `from_word_idx` is not defined in any visible cell, so
        # this (default!) branch raises NameError unless it is defined
        # elsewhere - presumably `voc_holder.from_word_idx` was meant; confirm.
        text_x = from_word_idx(text_x)

    search_body = {
        "query": {
            "bool": {
                "should": [{
                    "match": {
                        query_field: {
                            "query": text_x,
                            "boost": 1
                        }
                    }
                }, {
                    # Placeholder clause for conversational context; the
                    # query string is empty, so it is not expected to
                    # contribute any matches.
                    "match": {
                        "context": {
                            "query": '',
                            "boost": 1
                        }
                    }
                }]
            }
        },
        'from': 0,
        'size': num_hits
    }
    res = es.search(index="test-index", body=search_body)

    return [(hit['_source']['text_x'], hit['_source']['text_y']) for hit in res['hits']['hits']]

def query_es_bulk(texts_x, num_hits = 1, is_array = True, query_field = 'text_x'):
    """Retrieve the best matching replies for many query texts in one request.

    Uses the notebook-level ``es`` client and a single multi-search request
    against ``test-index``.

    Args:
        texts_x: the queries; raw texts when ``is_array`` is False, otherwise
            word-index data that is first converted with ``from_word_idx``.
        num_hits: maximum number of hits per query.
        is_array: whether ``texts_x`` must be converted to text first.
        query_field: document field the query texts are matched against.

    Returns:
        one list per query, each holding the ``text_y`` of that query's hits.
        An empty input yields ``[]`` without contacting Elasticsearch.
    """
    if is_array:
        # NOTE(review): `from_word_idx` is not defined in any visible cell, so
        # this (default!) branch raises NameError unless it is defined
        # elsewhere - presumably `voc_holder.from_word_idx` was meant; confirm.
        texts_x = from_word_idx(texts_x)

    # msearch expects alternating header / body entries, one pair per query.
    bodies = []
    for text_x in texts_x:
        bodies.append({"index": "test-index", "type": "tweet"})
        bodies.append({
            "query": {
                "bool": {
                    "should": [{
                        "match": {
                            query_field: {
                                "query": text_x,
                                "boost": 5
                            }
                        }
                    }, {
                        # Placeholder clause for conversational context; the
                        # query string is empty, so it is not expected to
                        # contribute any matches.
                        "match": {
                            "context": {
                                "query": '',
                                "boost": 1
                            }
                        }
                    }]
                }
            },
            'size': num_hits
        })

    # Nothing to ask: avoid sending an empty multi-search body.
    if not bodies:
        return []

    res = es.msearch(bodies)

    return [[hit['_source']['text_y'] for hit in response['hits']['hits']] for response in res['responses']]

In [10]:
# Connect with the client's default connection settings; `es` is the
# notebook-level client that query_es / query_es_bulk read as a global.
es = Elasticsearch()
# Build and populate the index only when it does not exist yet.
# NOTE(review): an index left over from an earlier run is reused as-is, even if
# the training split or the mapping changed since - delete it manually to force
# a rebuild.
if not es.indices.exists(index="test-index"):
    create_index(es)
    fill_index(es, train_x, train_y, 10000)

In [11]:
references = []
hypothesis = []

# The loop pairs test_x[i] with test_y[i]; refuse to evaluate misaligned data.
assert len(test_x) == len(test_y), 'test_x and test_y must be aligned pairwise'

for i in tqdm(range(len(test_x))):
    ref = test_y.iloc[i]
    # Only the first 1000 characters of the question are sent as the query.
    question = test_x.iloc[i][:1000]
    try:
        # Best hit -> (stored question, stored answer); keep the answer.
        a_text = query_es(question, is_array=False)[0][1]
    except Exception:
        # Best effort: no hit (IndexError) or a failed search yields an empty
        # hypothesis.  `Exception` rather than a bare `except` so that
        # interrupting the kernel still stops this long loop.
        a_text = ''

    #references.append(strip_punkt(voc_holder.to_word_idx(ref, -1), eval_conf.voc_holder.reverse_vocab))
    #hypothesis.append(strip_punkt(voc_holder.to_word_idx(a_text, -1), eval_conf.voc_holder.reverse_vocab))
    r = voc_holder.to_word_idx(ref, -1)
    h = voc_holder.to_word_idx(a_text, -1)
    # Keep only the non-zero word indexes (zero entries are dropped).
    references.append(r[r.nonzero()])
    hypothesis.append(h[h.nonzero()])


references = np.array(references)
hypothesis = np.array(hypothesis)




In [16]:
# Score the retrieved answers against the reference answers with every metric
# listed in hparams.evaluation_metrics, then print the formatted result.
# NOTE(review): the meaning of the positional `None` arguments and of the final
# `True` flag is defined in customersupport.evaluation.eval - confirm there.
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
evaluation = evaluate_words_index(references, hypothesis, eval_conf, hparams.evaluation_metrics, True)
print(format_metrics(evaluation))

BLEU@2: 13.732301176466532
Embedding Average: 96.8846297555305
Greedy Matching: 61.66439341050898
ROUGE_L: 22.347860605005877
Vector Extrema: 52.44791511076616


In [13]:
# Order-invariance check: the embedding-average score of the same three
# reference/hypothesis pairs is printed for three different orderings.
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)

for sample_order in ([1, 200, 3], [200, 1, 3], [3, 200, 1]):
    r = [references[pos] for pos in sample_order]
    h = [hypothesis[pos] for pos in sample_order]
    evaluation = evaluate_words_index(r, h, eval_conf, ["embedding_average"], True)
    print(format_metrics(evaluation))

Embedding Average: 97.53525121210514
Embedding Average: 97.53525121210514
Embedding Average: 97.53525121210514


In [14]:
# `stop` is an undefined name: evaluating it raises NameError (see the recorded
# output), which presumably serves to halt "Run All" before the scratch cells below.
stop

NameError: name 'stop' is not defined

In [None]:
# Scratch cell (never executed in the saved notebook): score all retrieved
# answers against the references using only the embedding-average metric.
# NOTE(review): the meaning of the positional `None` arguments and of the final
# `True` flag is defined in customersupport.evaluation.eval - confirm there.
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
evaluation = evaluate_words_index(references, hypothesis, eval_conf, ["embedding_average"], True)
print(format_metrics(evaluation))

In [None]:
# Scratch copy of the order-invariance check: the embedding-average score of
# the same three reference/hypothesis pairs is printed for three orderings.
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)

for sample_order in ([1, 200, 3], [200, 1, 3], [3, 200, 1]):
    r = [references[pos] for pos in sample_order]
    h = [hypothesis[pos] for pos in sample_order]
    evaluation = evaluate_words_index(r, h, eval_conf, ["embedding_average"], True)
    print(format_metrics(evaluation))

In [None]:
# Manual spot check: show the five stored pairs that best match one message.
# Alternative example messages:
#answer = '<user> every time i restart my <version> phone i get sms notifications from years ago , then 1/2 the time the home screen stops working . backup restore and happening on my iphone x now too <url>'
#answer = 'ah so when it is apple going to update xcode with <version> support ? updated my iphone 6s and now cannot build my project for my device . thanks <user>'
answer = '<user> i have updated my <hashtag> to <hashtag> . 2 and i am not seeing the apple pay cash ! i live in us'
ir_answers = query_es(answer, 5, False, 'text_x')

print(answer)
print()
# One block per hit: the matched stored question, then its stored reply.
formatted_hits = ['>>>>> ' + matched_question + '\n<<<<< ' + stored_reply
                  for matched_question, stored_reply in ir_answers]
print('\n\n'.join(formatted_hits))