In [1]:
import re
import random
import string
import warnings

import importlib

import customersupport.common
import customersupport.evaluation
import customersupport.evaluation.eval

print('Library versions:')

import tensorflow as tf
print('tensorflow:{}'.format(tf.__version__))
import pandas as pd
print('pandas:{}'.format(pd.__version__))
import numpy as np
print('numpy:{}'.format(np.__version__))

from IPython.display import SVG

from tqdm import tqdm_notebook as tqdm  # Special jupyter notebook progress bar

from tensorflow.python.layers import core as layers_core

from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch import helpers

from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset

from customersupport.evaluation.eval import evaluate_words_index, format_metrics, get_evaluation_conf, strip_punkt

# Pick up on-disk edits to the project modules without restarting the kernel.
importlib.reload(customersupport.common.vocab)
importlib.reload(customersupport.common.dataset)
importlib.reload(customersupport.evaluation)
importlib.reload(customersupport.evaluation.eval)

# `importlib.reload` does not rebind names that were pulled in earlier with
# `from ... import ...`; re-execute those statements so the notebook uses the
# freshly reloaded classes/functions instead of the stale pre-reload objects.
from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset
from customersupport.evaluation.eval import evaluate_words_index, format_metrics, get_evaluation_conf, strip_punkt

# Silences every Python warning for the rest of the session.
warnings.simplefilter('ignore')

tqdm().pandas()  # Enable tracking of progress in dataframe `apply` calls

  from ._conv import register_converters as _register_converters


Library versions:
tensorflow:1.7.0
pandas:0.22.0
numpy:1.14.2





In [2]:
# 8192 - large enough for demonstration, larger values make network training slower
MAX_VOCAB_SIZE = 2**13
# seq2seq generally relies on fixed length message vectors - longer messages provide more info
# but result in slower training and larger networks
MAX_MESSAGE_LEN = 70

# Single bag of settings handed to the dataset, vocabulary and evaluation helpers below.
hparams = tf.contrib.training.HParams(
    # Larger batch sizes generally reach the average response faster, but small batch sizes are
    # required for the model to learn nuanced responses.  Also, GPU memory limits max batch size.
    batch_size=128,
    encoder_length=MAX_MESSAGE_LEN,
    decoder_length=MAX_MESSAGE_LEN,
    # Embedding size for whole messages, same trade off as word embeddings
    num_units=512,
    src_vocab_size=MAX_VOCAB_SIZE,
    # Embedding size for words - gives a trade off between expressivity of words and network size
    # (200 matches the "200d" GloVe file named in glove_path - presumably required when use_glove is True)
    embedding_size=200,
    tgt_vocab_size=MAX_VOCAB_SIZE,
    # High learning rate helps model reach average response faster, but can make it hard to
    # converge on nuanced responses
    learning_rate = 0.001, # alternative left by the author: 0.0005
    max_gradient_norm = 5.0,
    beam_width = 10,
    use_attention = True,
    enc_num_layers = 2,
    dec_num_layers = 2,
    cell_type = 'bi',
    rnn_type = 'gru',
    max_epochs = 20,
    # Helps regularize network and prevent overfitting.
    # (This note originally sat above learning_rate; presumably it describes dropout / l2_reg.)
    dropout = 0.2,
    use_glove = True,
    l2_reg = 0., # disabled; the original trailing "0005" looks like a switched-off 0.0005 setting
    decay_rate = .9,
    # NOTE(review): machine-specific absolute paths - adjust before running elsewhere.
    glove_path = '/home/momchil/Storage/Projects/Python/Data/glove.twitter.27B/glove.twitter.27B.200d.txt',
    tweets_path = '/home/momchil/Storage/Projects/Python/Data/customer-support-on-twitter/twcs-conv_ids_clean.csv',
    # Ngram count for ROUGE and BLEU
    max_order = 2,
    train_size = 0.8,
    train_time_diff = 5.,
    first_day = 0, # alternatives left by the author: 5, 15
    last_day = 60, # alternatives left by the author: 33, 23
    evaluation_metrics = ["bleu", "rouge_l", "embedding_average", "vector_extrema", "greedy_matching"],
    training_metrics = ["bleu", "rouge_l", "embedding_average", "vector_extrema", "greedy_matching"],
    companies = ['AppleSupport']
)

Instructions for updating:
Use the retry module or similar alternatives.


In [3]:
%%time
cs_data = CustomerSupportDataset(hparams)

#& (y_text.str.contains('help') ^ True)
cs_data.process_utterances(['direct message'])

# Fit the vocabulary on both sides of the conversation, then vectorise and split.
voc_holder = VocabHolder(hparams)
analyzer = voc_holder.fit(cs_data.x_text, cs_data.y_text, hparams.src_vocab_size)

cs_data.text_to_vec(hparams, voc_holder)
cs_data.train_test_split(hparams, do_random=False)


def _select_complete_pairs(x_text, y_text, positions):
    """Return the x/y rows at `positions`, keeping only rows where BOTH texts are present.

    Dropping missing values from x and y independently (as plain `.dropna()` on
    each series does) shifts one side relative to the other as soon as a value
    is missing on only one side, so every later question would be paired with
    the wrong answer. The mask here is positional, so it does not depend on the
    two series sharing index labels.

    Assumes `x_text` / `y_text` are pandas Series of equal length.
    """
    positions = list(positions)
    x_rows = x_text.iloc[positions]
    y_rows = y_text.iloc[positions]
    keep = x_rows.notnull().values & y_rows.notnull().values
    return x_rows[keep], y_rows[keep]


train_x, train_y = _select_complete_pairs(cs_data.x_text, cs_data.y_text, cs_data.train_idx)
test_x, test_y = _select_complete_pairs(cs_data.x_text, cs_data.y_text, cs_data.test_idx)

Done support_author (984679, 9)
Replacing anonymized screen names in X...



Replacing anonymized screen names in Y...



Fitting CountVectorizer on X and Y text data...



Number of known words 7364
Learned vocab of 8192 items.
Calculating word indexes for X...



Calculating word indexes for Y...



Training data of shape (45582, 70) and test data of shape (4044, 70).
count    45582.000000
mean         1.000000
std          1.318587
min          0.019216
25%          0.123510
50%          0.566117
75%          1.191551
max          6.313764
dtype: float64
count    4044.000000
mean        1.000000
std         0.151071
min         0.737908
25%         0.878444
50%         1.003007
75%         1.116925
max         1.249495
dtype: float64
CPU times: user 4min, sys: 3.6 s, total: 4min 3s
Wall time: 4min 2s


In [4]:
def create_doc(text_x, text_y, context, author):
    """Assemble the document stored in Elasticsearch for one question/answer pair.

    Args:
        text_x: value stored under the `text_x` key.
        text_y: value stored under the `text_y` key.
        context: value stored under the `context` key.
        author: value stored under the `author` key.

    Returns:
        A dict with the four values above plus a `timestamp` key holding
        `datetime.now()` taken at call time.
    """
    return {
        'author': author,
        'text_x': text_x,
        'text_y': text_y,
        'context': context,
        'timestamp': datetime.now(),
    }


def _english_text_field():
    """Mapping for a `text` field with an additional `english`-analyzed sub-field."""
    return {
        "type": "text",
        "fields": {
            "english": {
                "type": "text",
                "analyzer": "english"
            }
        }
    }


def create_index(es):
    """Drop `test-index` (if present) and recreate it with an explicit `tweet` mapping.

    The previous implementation sent a hand-written JSON string that was not
    valid JSON (unbalanced braces) and that nested a `tokenizer` definition
    inside the field mapping, which Elasticsearch mappings do not accept.
    Because the create call also ignored HTTP 400, the request failed silently
    and the index was later auto-created with dynamic mappings. The mapping is
    now built as a dict and errors from index creation are raised.

    A custom n-gram tokenizer, if wanted, has to be declared under
    `settings.analysis` and referenced from a custom analyzer; it is not
    configured here.

    Args:
        es: Elasticsearch client exposing `indices.delete` / `indices.create`.
    """
    # A missing index is fine here, hence the ignored 400/404 on delete.
    es.indices.delete(index='test-index', ignore=[400, 404])

    mapping = {
        "mappings": {
            "tweet": {
                "properties": {
                    "text_x": _english_text_field(),
                    "text_y": _english_text_field(),
                    "context": _english_text_field(),
                    "author": {
                        "type": "text"
                    }
                }
            }
        }
    }

    # No `ignore=400`: the index was just deleted, so a 400 here means the
    # mapping was rejected and must not be hidden.
    es.indices.create(index='test-index', body=mapping)


def fill_index(es, text_x, text_y, batch=10000, author='AppleSupport'):
    """Bulk-index question/answer pairs into `test-index` as `tweet` documents.

    Pairs are taken positionally from `text_x` and `text_y` (via `zip`, so the
    shorter input decides how many pairs are indexed). Each pair becomes one
    document built by `create_doc` with an empty context. Documents are sent
    with `helpers.bulk` in chunks of `batch`, plus one final partial chunk.
    Document ids are left for Elasticsearch to assign.

    Args:
        es: Elasticsearch client passed through to `helpers.bulk`.
        text_x: iterable of question texts.
        text_y: iterable of answer texts.
        batch: number of documents per bulk request.
        author: value stored in each document's `author` field.
    """
    actions = []
    for question, reply in zip(text_x, text_y):
        actions.append({
            "_index": "test-index",
            "_type": "tweet",
            "_source": create_doc(question, reply, '', author)
        })
        if len(actions) == batch:
            helpers.bulk(es, actions)
            # Report only after the bulk request has actually gone through.
            print("Pushed {} rows".format(len(actions)))
            actions = []

    # Flush whatever is left over after the last full batch.
    if actions:
        helpers.bulk(es, actions)


def query_es(text_x, num_hits = 1, is_array = True, query_field = 'text_x'):
    """Search `test-index` for documents similar to `text_x`.

    Relies on two notebook-level names that are not defined in this function:
    the Elasticsearch client `es`, and `from_word_idx` (used only when
    `is_array` is true).

    Args:
        text_x: the query; passed through `from_word_idx` first when
            `is_array` is true, otherwise used as given.
        num_hits: maximum number of hits requested (`size` in the query).
        is_array: whether `text_x` must be converted with `from_word_idx`.
        query_field: document field the query text is matched against.

    Returns:
        A list of `(text_x, text_y)` tuples taken from each hit's `_source`,
        in the order Elasticsearch returned them.
    """
    if is_array:
        text_x = from_word_idx(text_x)

    search_body = {
        "query": {
            "bool": {
                "should": [{
                    "match": {
                        query_field: {
                            "query": text_x,
                            "boost": 1
                        }
                    }
                }, {
                    # Context clause is sent with an empty query string.
                    "match": {
                        "context": {
                            "query": '',
                            "boost": 1
                        }
                    }
                }]
            }
        },
        'from': 0,
        'size': num_hits
    }
    res = es.search(index="test-index", body=search_body)

    return [(hit['_source']['text_x'], hit['_source']['text_y']) for hit in res['hits']['hits']]

def query_es_bulk(texts_x, num_hits = 1, is_array = True, query_field = 'text_x'):
    """Run one multi-search request with a query per entry of `texts_x`.

    Uses the notebook-level Elasticsearch client `es`; `from_word_idx` is
    applied to `texts_x` first when `is_array` is true. Neither name is defined
    in this function.

    Returns:
        One list per response, holding the `text_y` of each hit's `_source`.
    """
    if is_array:
        texts_x = from_word_idx(texts_x)

    def build_search(text):
        # Same two-clause `should` query for every text; only the query string varies.
        primary = {"match": {query_field: {"query": text, "boost": 5}}}
        context = {"match": {"context": {"query": '', "boost": 1}}}
        return {
            "query": {"bool": {"should": [primary, context]}},
            'size': num_hits
        }

    # msearch expects alternating header / body entries.
    requests = []
    for text in texts_x:
        requests += [{"index": "test-index", "type": "tweet"}, build_search(text)]

    responses = es.msearch(requests)['responses']

    return [
        [hit['_source']['text_y'] for hit in response['hits']['hits']]
        for response in responses
    ]

In [5]:
# Client with default connection settings; the query helpers read this global.
es = Elasticsearch()

# Build and populate the index only when it does not exist yet, so re-running
# this cell does not re-index the training pairs.
if not es.indices.exists(index="test-index"):
    create_index(es)
    fill_index(es, train_x, train_y, batch=10000)

In [11]:
references = []
hypothesis = []
failed_queries = 0

for i in tqdm(range(len(test_x))):
    ref = test_y.iloc[i]
    # Only the first 1000 characters of the question are sent to Elasticsearch.
    question = test_x.iloc[i][:1000]
    try:
        hits = query_es(question, is_array=False)
    except Exception:
        # Best effort: a failed query is scored as an empty answer, but it is
        # counted so that an unreachable or misconfigured cluster cannot pass
        # unnoticed. KeyboardInterrupt is deliberately not caught.
        hits = []
        failed_queries += 1

    # Top hit's answer text, or an empty answer when nothing matched.
    a_text = hits[0][1] if hits else ''

    r = voc_holder.to_word_idx(ref, -1)
    h = voc_holder.to_word_idx(a_text, -1)
    # Keep only the non-zero word indexes (presumably zero is padding - TODO confirm).
    references.append(r[r.nonzero()])
    hypothesis.append(h[h.nonzero()])

if failed_queries:
    print('{} of {} queries failed and were scored as empty answers'.format(failed_queries, len(test_x)))

references = np.array(references)
hypothesis = np.array(hypothesis)




In [12]:
# Build the evaluation configuration. Only `hparams` and `voc_holder` are supplied;
# the other three positional arguments are passed as None.
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
# Score the retrieved answers (`hypothesis`) against the reference answers using the
# metrics listed in `hparams.evaluation_metrics`.
# NOTE(review): the meaning of the trailing `True` flag is not visible in this notebook.
evaluation = evaluate_words_index(references, hypothesis, eval_conf, hparams.evaluation_metrics, True)
print(format_metrics(evaluation))

BLEU@2: 13.728593680241405
Embedding Average: 81.47075795484712
Greedy Matching: 34.52206711164485
ROUGE_L: 22.34534856474629
Vector Extrema: 40.15821643147644


In [8]:
# Example questions to try; exactly one `answer` assignment should be active.
#answer = '<user> every time i restart my <version> phone i get sms notifications from years ago , then 1/2 the time the home screen stops working . backup restore and happening on my iphone x now too <url>'
#answer = 'ah so when it is apple going to update xcode with <version> support ? updated my iphone 6s and now cannot build my project for my device . thanks <user>'
answer = '<user> i have updated my <hashtag> to <hashtag> . 2 and i am not seeing the apple pay cash ! i live in us'

# Fetch the five best-matching stored questions together with their replies.
ir_answers = query_es(answer, num_hits=5, is_array=False, query_field='text_x')

print(answer)
print()
# One block per hit: the matched question, then the reply that was given to it.
formatted_hits = ['>>>>> ' + question + '\n<<<<< ' + reply for question, reply in ir_answers]
print('\n\n'.join(formatted_hits))

<user> i have updated my <hashtag> to <hashtag> . 2 and i am not seeing the apple pay cash ! i live in us

>>>>> i have updated everything and finally caved in to pay for more storage and i still see boxes 🤦🏾‍♀️ why <user> ??? <hashtag> 😭😭 
<<<<< <user> we think this might help with that: <url>

>>>>> <user> live wallpapers are not working. i am running <version>  on my new <hashtag>. i have tried restarting and also switching to a still then switching back to a live. help, please. 
<<<<< <user> we are happy to help you get this fixed. sounds like you may have the reduce motion feature enabled. follow the steps here to locate and disable this: <url>  let us know if this helps you.

>>>>> <user> cannot get apple pay cash to set up. keep getting the 'apple pay cash unavailable' message when trying to set up. help!! 
<<<<< <user> for help with apple pay cash, connect with our specialists here: <url>

>>>>> <user> will apple pay cash work between countries? i would love to send some $$ to 