In [35]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import datetime
from collections import Counter
import pandas as pd
from url_parsing import simplify_url
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from train_utils import build_stream_df, build_train_test_data

from recommender import Recommender

import matplotlib.pyplot as plt
%matplotlib inline

## Data Prep
The first step in data preparation is to convert a dataframe of individual clicks to a dataframe with lists of sequential clicks by user. This dataframe will be called 'stream_df'. The urls are then encoded to numerical indices and then into one-hot representations.

One of the main challenges with making recommendations for the intent of an online customer is the lack of labelled data. As a proxy of the intent we will use clicks on FAQ pages. Clearly, if a user accessed a page "how to change my billing address" then the intent behind the online journey is likely to change the billing address.

Most observed journeys do not contain clicks on FAQ pages. In the following we will downsample the dataset to only include journeys containing such pages. 

In [28]:
data_file = 'data/click_call_balanced.csv'
UNIQUE_ID = 'UCRN'
n_datapoints = None

df = pd.read_csv(data_file, sep=',')
df['date_time'] = df['date_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
df = df[['pagename', 'date_time', UNIQUE_ID, 'has_call']]

simple_urls = [simplify_url(page) for page in df['pagename']]
df.loc[:, 'pagename'] = simple_urls

stream_df = build_stream_df(df, UNIQUE_ID)

all_urls = np.unique(np.concatenate(stream_df['url_sequence'].values))
l_enc = LabelEncoder()    
enc_urls = l_enc.fit(all_urls)

stream_df['encoded_sequence'] = [l_enc.transform(x[:-1]) for x in stream_df['url_sequence']]

oh_enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
oh_enc.fit(np.sort(np.unique(np.concatenate(stream_df['encoded_sequence'].values))).reshape(-1, 1))

stream_df['oh_encoded_sequence'] = [oh_enc.transform(x.reshape(-1, 1)) for x in stream_df['encoded_sequence']]

journey = stream_df['url_sequence'].values[0]
rec = Recommender()
recs = np.array([rec.recommend(journey) for journey in stream_df['url_sequence'].values])
keep_idxs = np.where((recs!='unknown') & (recs!='cycle longer than 40 clicks') & (recs!='over 40 clicks'))[0]

observations = stream_df['oh_encoded_sequence'].iloc[keep_idxs].values
journeys = np.array([np.sum(obs,axis=0) for obs in observations])
labels = recs[keep_idxs]


## HMMLearn Approach
In this approach we will use a Hidden Markov model where the hidden state is the thought to be the intent and the observations are the encoded urls.
We conduct experiments with Gaussian emission probabilities and multinomial emission probabilities.

In [39]:
from hmmlearn import hmm
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# get a dense representation of the observed journeys using SVD for the Gaussian HMM
svd = TruncatedSVD(n_components=2)
svd.fit(journeys)
observations_vecs = [svd.transform(obs) for obs in observations]

# encode the FAQ labels
intent_enc = LabelEncoder()
enc_labels = intent_enc.fit_transform(labels)

# idea 1: use a HMM with gaussian emmissions on the dense observations
observations_vecs_train, observations_vecs_test, vecs_y_train, vecs_y_test = train_test_split(
    observations_vecs, enc_labels, test_size=0.33, random_state=42)

vecs_lengths_train = [len(obs) for obs in observations_vecs_train]
vecs_lengths_test = [len(obs) for obs in observations_vecs_test]

gaussian_model = hmm.GaussianHMM(n_components=10, covariance_type="diag", n_iter=1000, params='se', init_params='se')
gaussian_model.transmat_ = np.diag(np.ones(n_components))
gaussian_model.fit(np.concatenate(observations_vecs_train), vecs_lengths_train)


GaussianHMM(algorithm='viterbi', covariance_type='diag', covars_prior=0.01,
      covars_weight=1, init_params='se', means_prior=0, means_weight=0,
      min_covar=0.001, n_components=10, n_iter=1000, params='se',
      random_state=None, startprob_prior=1.0, tol=0.01, transmat_prior=1.0,
      verbose=False)

In [33]:
# idea 2: use a HMM with multinomial emissions on the one-hot encoded observations
n_components = 10
observations_resh = [o[:, np.newaxis] for o in stream_df['encoded_sequence'][keep_idxs].values]

transmat_prior = np.diag(np.ones(n_components))    
multinomial_model = hmm.MultinomialHMM(n_components=n_components, n_iter=100, transmat_prior=transmat_prior, 
                           verbose=True, params='se', init_params='se')
n_features = np.max(np.concatenate(observations_resh))
start_probability = np.ones(n_features)/float(n_features)

# no transitions of the state
transition_probability = np.diag(np.ones(n_components))
# uniform prior
emission_probability = np.ones((n_components, n_features))/n_features

#gaussian_model.startprob_ = start_probability
multinomial_model.transmat_ = transition_probability

# when fitting the HMM we need at least one observation of every state
#reversed_observations = [obs[::-1] for obs in observations]
X_train, X_test, y_train, y_test = train_test_split(observations_resh, enc_labels, test_size=0.33, random_state=42)

train_lengths = [len(obs) for obs in X_train]
# have to add this to avoid exception
dummy = np.arange(len(l_enc.classes_))[:,np.newaxis]
train_lengths += [len(dummy)]
multinomial_model.fit(np.concatenate([np.concatenate(X_train), dummy]), train_lengths)



         1     -378360.7910             +nan
         2     -244626.2556     +133734.5355
         3     -241724.0121       +2902.2435
         4     -240685.2459       +1038.7662
         5     -240258.2226        +427.0233
         6     -240010.5880        +247.6346
         7     -239909.1195        +101.4685
         8     -239842.1626         +66.9569
         9     -239804.3763         +37.7863
        10     -239775.4390         +28.9372
        11     -239739.8892         +35.5498
        12     -239733.2165          +6.6728
        13     -239725.8385          +7.3779
        14     -239722.9506          +2.8879
        15     -239686.9594         +35.9913
        16     -239663.1820         +23.7774
        17     -239646.2716         +16.9103
        18     -239642.6430          +3.6286
        19     -239638.8998          +3.7432
        20     -239633.6328          +5.2670
        21     -239631.6068          +2.0260
        22     -239630.1493          +1.4575
        23

MultinomialHMM(algorithm='viterbi', init_params='se', n_components=10,
        n_iter=100, params='se',
        random_state=<mtrand.RandomState object at 0x107e736e0>,
        startprob_prior=1.0, tol=0.01,
        transmat_prior=array([[ 1.,  0., ...,  0.,  0.],
       [ 0.,  1., ...,  0.,  0.],
       ...,
       [ 0.,  0., ...,  1.,  0.],
       [ 0.,  0., ...,  0.,  1.]]),
        verbose=True)

In [43]:
def accuracy(y_true, preds):
    score=0
    for i in range(len(y_true)):
        if y_true[i] in preds[i]:
            score+=1
    score /= float(len(preds))
    return score

preds_train_gaussian = [gaussian_model.predict(obs) for obs in observations_vecs]
preds_test_gaussian = [gaussian_model.predict(obs) for obs in observations_vecs]

preds_train_multi = [multinomial_model.predict(obs)[-1] for i,obs in enumerate(X_train)]
preds_test_multi = [multinomial_model.predict(obs)[-1] for i,obs in enumerate(X_test)]

print("Gaussian HMM Train Acc: {}").format(accuracy(y_train, [x for x in preds_train_gaussian]))
print("Gaussian HMM Test Acc: {}").format(accuracy(y_test, [x for x in preds_test_gaussian]))
print("Multinomial HMM Train Acc: {}").format(accuracy(y_train, [[x] for x in preds_train_multi]))
print("Multinomial HMM Test Acc: {}").format(accuracy(y_test, [[x] for x in preds_test_multi]))

Gaussian HMM Train Acc: 0.00822078684674
Gaussian HMM Test Acc: 0.00822078684674
Multinomial HMM Train Acc: 0.00788781770377
Multinomial HMM Test Acc: 0.00711743772242


## Use Bayes Rule
This is an extension of the HMM. What we can do is use a simple HMM with few hidden states (e.g. 2) and then use Bayes Rule to infer the FAQ given the hidden state. This way many different journeys can be represented as the same state giving a higher level representation. In the following 'intent' will denote the hidden state of the HMM and 'faq' will denote the prediction of the model.

In [50]:
# infer probability table for p(faq|intent) using bayes rule
faqs = np.unique(enc_labels)
n_faqs = len(faqs)
prob_matrix = np.zeros((n_faqs, n_components))
for faq,intent in zip(enc_labels, preds_train_multi):
    prob_matrix[faq,intent] += 1
prob_matrix = prob_matrix / np.sum(prob_matrix, axis=1)[:, np.newaxis]
prob_matrix[np.sum(np.isnan(prob_matrix), axis=1)>1] = 1./prob_matrix.shape[1]
p_faq = Counter(y_train)
for k in p_faq.keys():
    p_faq[k] /= float(len(y_train))

# P(intent)
p_intent = Counter(preds_train)
for k in p_intent.keys():
    p_intent[k] /= float(len(preds_train))

# P(FAQ | intent)
def p_faq_intent(faq, intent):
    return prob_matrix[faq,intent] * p_faq[faq] / p_intent[intent]

# P(FAQ)
def get_faq(intent, top_n=1):
    return np.argsort([p_faq_intent(f, intent) for f in faqs])[-top_n:]


In [49]:
print("Bayes Rule Train Acc: {}").format(accuracy(y_train, [get_faq(pred, 3) for pred in preds_train]))
print("Bayes Rule Test Acc: {}").format(accuracy(y_test, [get_faq(pred, 3) for pred in preds_test]))
#np.sum(enc_labels==pred_faq)

# do a few tests
idx = 0
label = labels[0]
journey = stream_df['url_sequence'].iloc[keep_idxs].values[idx]
enc_journey = stream_df['encoded_sequence'].iloc[keep_idxs].values[idx]
pred = multinomial_model.predict(enc_journey.reshape(-1,1))[-1]
faq = get_faq(pred)
[intent_enc.inverse_transform(f) for f in faq]

Bayes Rule Train Acc: 0.336546888694
Bayes Rule Test Acc: 0.377224199288


['how can you help me with my gas or boiler annual service?']