<a href="https://colab.research.google.com/github/mns0/Relix/blob/main/DataModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import os
import glob
import re
import pickle
#from hmmlearn import hmm
from pomegranate import *


In [None]:
# Load the trajectories stored in the pickle file into `data`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file from a trusted source.
pickle_file_name_open = "minimally_processed_trajectories.pkl"
with open(pickle_file_name_open, mode='rb') as trajectory_file:
    data = pickle.load(trajectory_file)

In [None]:
# Determine the feature vocabulary.
#
# Every trajectory in `data` is walked event by event and two label sets are
# collected:
#   * write events contribute their second element (event[1]) as a label;
#   * two-element click events contribute either their raw second element,
#     the collapsed label "Address" (when characters 1..3 of the target are
#     digits), or the collapsed label "CityNameWithZipCode" (when that
#     substring occurs in the target).
# Click events with more than two elements are ignored (presumably these carry
# the "CANT FIND MATCH" / "NoMatch" keywords -- TODO confirm against the data).
click_event_states = set()
write_event_states = set()
for traj in data:
    for event in traj:
        # Every branch below reads event[1]; skip events that are too short
        # instead of failing with an IndexError.
        if len(event) < 2:
            continue
        action, target = event[0], event[1]
        if action == 'write':
            write_event_states.add(target)
        elif action == 'click' and len(event) == 2:
            if "CityNameWithZipCode" in target:
                # for CityNameWithZipCode click events
                click_event_states.add("CityNameWithZipCode")
            elif target[1:4].isdigit():
                click_event_states.add("Address")
            else:
                click_event_states.add(target)

# Sort the labels so the feature -> index mapping is identical on every kernel
# restart (set iteration order for strings is not stable across processes),
# and take the union so a label that is both clicked and written gets exactly
# one index. Labels are assumed to be strings so that they can be sorted.
feature_vector = ['goto'] + sorted(
    (click_event_states | write_event_states) - {'goto'}
)
feature_vector_to_idx = {feature: i for i, feature in enumerate(feature_vector)}
idx_to_feature_vector = {i: feature for feature, i in feature_vector_to_idx.items()}

In [None]:
# Allocate the zero-initialised containers for the Markov model.
# Everything is sized by the number of entries in `feature_vector`:
#   transition_matrix[row, col] -- row = parent state, col = child state
#   outdegree[row]              -- number of transitions leaving `row`
#   initial_transition_vector   -- one slot per feature
num_features = len(feature_vector)
transition_matrix = np.zeros(shape=(num_features, num_features))
outdegree = np.zeros(num_features)
initial_transition_vector = np.zeros_like(outdegree)

def get_idx(line):
    """Map one trajectory event to its index in the feature vocabulary.

    The lookup uses the module-level ``feature_vector_to_idx`` mapping.

    Parameters
    ----------
    line : sequence
        The event. ``line[0]`` is the action name; for 'write' and 'click'
        events ``line[1]`` is the target label.

    Returns
    -------
    int
        The feature index, or -1 when the action is not one of
        'goto', 'write' or 'click'.

    Raises
    ------
    KeyError
        If the resolved label is missing from ``feature_vector_to_idx``.
    """
    action = line[0]
    if action == 'goto':
        return feature_vector_to_idx[action]
    if action == 'write':
        return feature_vector_to_idx[line[1]]
    if action != 'click':
        return -1

    target = line[1]
    # City/zip targets are collapsed to a single shared label.
    if "CityNameWithZipCode" in target:
        return feature_vector_to_idx["CityNameWithZipCode"]
    # Targets whose characters 1..3 are digits are collapsed to "Address".
    if target[1:4].isdigit():
        return feature_vector_to_idx["Address"]
    return feature_vector_to_idx[target]
            

# Count every observed parent -> child transition between consecutive events.
for traj in data:
    if len(traj) < 2:
        # No consecutive pair to count.
        continue
    states = [get_idx(event) for event in traj]
    for parent, child in zip(states, states[1:]):
        # get_idx returns -1 for events it does not recognise. Used as a
        # numpy index, -1 would silently address the LAST row/column and
        # corrupt that feature's counts, so such transitions are skipped.
        if parent < 0 or child < 0:
            continue
        transition_matrix[parent, child] += 1  # row = parent, col = child
        outdegree[parent] += 1

# Normalize each row by its out-degree so rows with outgoing transitions sum
# to 1; rows without any outgoing transition are left as all zeros.
row_totals = outdegree[:, np.newaxis]
transition_matrix = np.divide(
    transition_matrix,
    row_totals,
    out=np.zeros_like(transition_matrix),
    where=row_totals != 0,
)
    

In [None]:
# Build the dataset for the HMM / Markov-chain models.
# Each trajectory becomes a list of feature indices produced by get_idx
# (events get_idx does not recognise are encoded as -1).
sequences_raw = []
sequences = [[get_idx(event) for event in traj] for traj in data]

# Length of every encoded trajectory, in the same order as `sequences`.
sequences_len = [len(encoded) for encoded in sequences]

# All encoded trajectories flattened into one 1-D array.
X = np.concatenate(sequences)
#model = hmm.GaussianHMM(n_components=32).fit(X, sequences_len)


In [None]:
# Simple Markov-chain model.
# Generate up to `num_of_samples` distinct sequences and rank them by
# likelihood (highest first).

mc_model = MarkovChain.from_samples(sequences)

num_of_samples = 100
# Upper bound on draws: without it the loop would never terminate when the
# chain cannot produce `num_of_samples` distinct sequences.
max_attempts = 100 * num_of_samples

seen = set()
ranked_generated_sequences = []
attempts = 0
while len(ranked_generated_sequences) < num_of_samples and attempts < max_attempts:
    attempts += 1
    # NOTE(review): the chain's second distribution is passed as sample()'s
    # argument, exactly as before -- confirm against the pomegranate docs
    # that this is the intended way to choose the sample length.
    sample = mc_model.sample(mc_model.distributions[1])
    tup_sam = tuple(sample)
    if tup_sam in seen:
        continue  # keep only the first occurrence of each sequence
    seen.add(tup_sam)
    log_prob = mc_model.log_probability(sample)
    # Translate feature indices back into their readable labels.
    labels = [idx_to_feature_vector[state] for state in sample]
    ranked_generated_sequences.append([np.exp(log_prob), labels])

if len(ranked_generated_sequences) < num_of_samples:
    print("Only %d unique sequences found after %d draws"
          % (len(ranked_generated_sequences), attempts))

ranked_generated_sequences.sort(reverse=True, key=lambda entry: entry[0])
ranked_generated_sequences[0:100]

[[0.6105805335941531,
  ['goto',
   "'See today\\'s rates'",
   "'See rates', near('First, are you buying?')"]],
 [0.21596141197105895, ['goto', "'Our Services'", "'About Us'"]],
 [0.1734580544347879,
  ['goto',
   "'See today\\'s rates'",
   "'See rates', near('Or refinancing a home?')"]],
 [0.0014746076034176311,
  ['goto',
   "'Our Services'",
   "'About Us'",
   "'View Rates'",
   "'Refinance'",
   "'Custom'",
   "near('Max Rate')",
   "near('20 Year')",
   "'Get Started'",
   'location',
   'Address',
   "'Next'",
   'email',
   "'Continue'",
   "'About Us'",
   "'View Rates'",
   "'Refinance'",
   "'Custom'",
   "near('Max Rate')",
   "near('20 Year')",
   "'Get Started'",
   'location',
   'Address',
   "'Next'"]],
 [0.00069950691117,
  ['goto',
   "'Our Services'",
   "'About Us'",
   "'View Rates'",
   "'Refinance'",
   "'Custom'",
   "near('Max Rate')",
   "near('20 Year')",
   "'Get Started'",
   'location',
   'Address',
   "'Next'",
   'email',
   "'Continue'",
   "'Refina

In [None]:
#hmm = HiddenMarkovModel().from_samples(
#    NormalDistribution,n_components=5, X=sequences,end_state=True)