<a href="https://colab.research.google.com/github/mikecabs/VolvoHMM/blob/main/MultinomialHMM_FirstIteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

HMM Learn

In [4]:
# Need to install everytime
!pip install hmmlearn



In [5]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hmmlearn
sns.set()
# enhance inline image quality
%config InlineBackend.figure_format = 'retina'

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Note: I rename the folder to HMM since it's Hidden Markov Model, check folder name if this line doesn't work
%cd "/content/drive/My Drive/Volvo HMM Project/"

/content/drive/.shortcut-targets-by-id/1T01a1rxUdrfdS4tfGgwWks4bUw3sfNf0/Volvo HMM Project


Load Data

In [8]:
c_pageseq = pd.read_csv('./Datasets/q4_cPageSeq.csv')
nc_pageseq = pd.read_csv('./Datasets/q4_ncPageSeq.csv')

In [9]:
#Adding in if converted so I can combine the datasets
#The rows in the converted dataset are there because the user converted within a two week span
c_pageseq['Conversion Dataset'] = 1
nc_pageseq['Conversion Dataset'] = 0
c_pageseq['Page Sequence'] = c_pageseq.seq.apply(lambda x: [int(i) for i in x.split(",")])
nc_pageseq['Page Sequence'] = nc_pageseq.seq.apply(lambda x: [int(i) for i in x.split(",")])

In [10]:
#Combining converted and non-converted visitors
nc_pageseq = nc_pageseq.sample(len(c_pageseq))
pageseq = pd.concat([c_pageseq, nc_pageseq])

In [11]:
pageseq['User and Session'] = pageseq.ids.apply(lambda x: [i for i in x.split(',')])
pageseq['User ID'] = pageseq['User and Session'].apply(lambda x: x[1])
pageseq['Session ID'] = pageseq['User and Session'].apply(lambda x: x[0])
pageseq = pageseq.drop(columns = ['ids', 'User and Session', 'seq'])
pageseq = pageseq.set_index('Session ID')

In [12]:
pageseq['Media Channel'] = pageseq['Page Sequence'].apply(lambda x: x[0])

In [13]:
#Adding if the session converted and by which pages to the table
con_ses = []
con_ways = []
for seq in pageseq['Page Sequence']:
  con_ses_bool = 0
  converted_pages = []
  for page in seq:
    if page == 24 or page == 34 or page == 39 or page == 86:
      converted_pages.append(page)
      con_ses_bool = 1
  con_ses.append(con_ses_bool)
  con_ways.append(converted_pages)

In [14]:
pageseq['Converted Session'] = con_ses
pageseq['Conversion Method'] = con_ways

In [15]:
pageseq = pageseq[['User ID', 'Page Sequence', 'Conversion Dataset', 'Converted Session', 'Media Channel', 'Conversion Method']]
#Note: there are some sequences with a conversion action that are not in the conversion dataset because 86 is not identified as converted
display(pageseq.head())

Unnamed: 0_level_0,User ID,Page Sequence,Conversion Dataset,Converted Session,Media Channel,Conversion Method
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1569880989,4591098790048594944,"[126, 74, 84, 74, 84, 24]",1,1,126,[24]
1569881650,8163510663962056704,"[125, 95, 84, 24]",1,1,125,[24]
1569882913,5836022827757765632,"[125, 77, 5, 74, 5, 74, 85, 90, 93, 99, 91, 92...",1,1,125,"[86, 24]"
1569883055,3586855111765427200,"[130, 79, 84, 79, 85, 90, 93, 99, 91, 92, 96, ...",1,1,130,"[86, 24]"
1569883489,906571704020631040,"[125, 85, 90, 93, 99, 91, 92, 96, 95, 96, 95, ...",1,1,125,"[86, 86, 24]"


Creating a Starting Probability and Emmissions Matrix

In [16]:
#Creating a dictionary of the number of times a converted user enters a page
c_pages = {}
just_c = pageseq[pageseq['Converted Session']==1]
for seq in just_c['Page Sequence']:
  for page in seq:
      prev_val = c_pages.get(page)
      if prev_val == None:
        c_pages[page] = 1
      else:
        c_pages[page] = prev_val + 1
#Creating a dictionary of the number of times a nonconverted user enters a page
nc_pages = {}
just_nc = pageseq[pageseq['Converted Session']==0]
for seq in just_nc['Page Sequence']:
  for page in seq:
      prev_val = nc_pages.get(page)
      if prev_val == None:
        nc_pages[page] = 1
      else:
        nc_pages[page] = prev_val + 1

In [17]:
#Creating a dictionary of the probability a converted user started on a specific page
startingpage = {}
for i, page in enumerate(pageseq['Media Channel']):
  is_converted = pageseq['Converted Session'][i]
  prev_val = startingpage.get(page)
  if prev_val == None:
    if is_converted:
      startingpage[page] = (1,1)
    else:
      startingpage[page] = (0,1)
  else:
    if is_converted:
      startingpage.update({page:(prev_val[0]+1, prev_val[1]+1)})
    else:
      startingpage.update({page:(prev_val[0], prev_val[1]+1)})

In [18]:
#Creating the emissions matrix where the first row is if the user is not converted
total_cpages = sum(c_pages.values())
total_ncpages = sum(nc_pages.values())
emission_matrix = [[0.0]*199]*2
for i in [0,1]:
  for j in np.arange(199):
    if i:
      val = c_pages.get(j)
      if val == None:
        val = 0
      probability = val / total_cpages
      emission_matrix[i][j] = probability
    else:
      val = nc_pages.get(j)
      if val == None:
        val = 0
      probability = val / total_ncpages
      emission_matrix[i][j] = probability

In [19]:
#Creating the Starting Probability Matrix
starting_matrix = [0.0]*199
for i in np.arange(199):
    val = startingpage.get(i)
    if val:
      probability = val[0] / val[1]
      starting_matrix[i] = probability

Attempting to fit MultinomialHMM

In [20]:
from hmmlearn.hmm import MultinomialHMM

Reference: https://hmmlearn.readthedocs.io/en/latest/tutorial.html#training-hmm-parameters-and-inferring-the-hidden-states

In [21]:
list_of_sequences = []
for i in pageseq['Page Sequence']:
  s = []
  for j in i:
    s.append([j])
  list_of_sequences.append(s)
seqs_list = np.concatenate(list_of_sequences)
seqs_len = [len(i) for i in list_of_sequences]

In [22]:
hmm_table = pageseq[['User ID', 'Converted Session']]
hmm_table['Page List'] = list_of_sequences

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
hmm_table

Unnamed: 0_level_0,User ID,Converted Session,Page List
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1569880989,4591098790048594944,1,"[[126], [74], [84], [74], [84], [24]]"
1569881650,8163510663962056704,1,"[[125], [95], [84], [24]]"
1569882913,5836022827757765632,1,"[[125], [77], [5], [74], [5], [74], [85], [90]..."
1569883055,3586855111765427200,1,"[[130], [79], [84], [79], [85], [90], [93], [9..."
1569883489,906571704020631040,1,"[[125], [85], [90], [93], [99], [91], [92], [9..."
...,...,...,...
1575659489,9051112861671794688,0,"[[125], [77], [7], [81], [80], [89]]"
1577225284,3788124383139236864,0,"[[126], [81], [80]]"
1571442544,7978353661760361472,1,"[[125], [87], [86], [87]]"
1571841895,1835028871891674880,0,"[[127], [77], [82], [11], [6], [11], [78], [97]]"


In [24]:
hmm = MultinomialHMM(n_components=2, init_params='se')

In [25]:
#Will not finish running
hmm.startprob_ = np.array(starting_matrix)
hmm.emissionprob_ = np.array(emission_matrix)

In [26]:
#Fitting to a model where the emission and starting matrix can be altered
hmm.fit(seqs_list, seqs_len)

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'


MultinomialHMM(algorithm='viterbi', init_params='se', n_components=2, n_iter=10,
               params='ste',
               random_state=RandomState(MT19937) at 0x7F5C82AFC780,
               startprob_prior=1.0, tol=0.01, transmat_prior=1.0,
               verbose=False)

In [27]:
#Saving the model 
import pickle
with open("./Datasets/Multinomial_2n_1000iter.pkl", "wb") as file: pickle.dump(hmm, file)

In [28]:
hmm.monitor_.converged

True

In [29]:
hmm.get_stationary_distribution()

array([0.09125523, 0.90874477])

In [30]:
#Getting the most likely path of a converted user
converted_sequences = []
for i in pageseq[pageseq['Converted Session']==1]['Page Sequence']:
  s = []
  for j in i:
    s.append([j])
  converted_sequences.append(s)

best_score = -1000000
best_sequence = []
for i,seq in enumerate(converted_sequences):
  score = hmm.score(seq)
  if score > best_score:
    best_score = score
    best_sequence = converted_sequences[i]

In [31]:
best_sequence

[[125], [24]]

In [32]:
best_score

-5.583614527843257

In [33]:
hmm.decode(best_sequence)

(-5.6148537057782555, array([0, 1]))

In [34]:
#Getting the most likely path of a nonconverted user
nonconverted_sequences = []
for i in pageseq[pageseq['Converted Session']==0]['Page Sequence']:
  s = []
  for j in i:
    s.append([j])
  nonconverted_sequences.append(s)

best_nscore = -1000000
best_nsequence = []
for i,seq in enumerate(nonconverted_sequences):
  score = hmm.score(seq)
  if score > best_nscore:
    best_nscore = score
    best_nsequence = nonconverted_sequences[i]

In [35]:
best_nscore

-5.009889393808022

In [36]:
best_nsequence

[[125], [77]]

In [37]:
hmm.decode(best_nsequence)

(-5.338700816792741, array([0, 0]))

In [38]:
list_scores = [hmm.score(i) for i in hmm_table['Page List']]
hmm_table['Score'] = list_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [39]:
hmm_table = hmm_table.sort_values('Score', ascending=False).drop_duplicates(['Score'])

In [40]:
hmm_table[hmm_table['Converted Session']==1][:20]

Unnamed: 0_level_0,User ID,Converted Session,Page List,Score
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1571795906,8057262334226176000,1,"[[125], [24]]",-5.583615
1572634913,5569061000809496576,1,"[[127], [86]]",-5.595411
1572052111,2366803702114326528,1,"[[126], [24]]",-5.656997
1573908375,6536067338407500800,1,"[[127], [24]]",-5.699612
1572982438,5182842934369292288,1,"[[126], [39]]",-6.294244
1569925559,497671481498709376,1,"[[130], [24]]",-6.368394
1576684527,7581896023924689920,1,"[[125], [34]]",-7.404224
1577463085,6669101218805455872,1,"[[126], [34]]",-7.477607
1575834213,1365451679288016384,1,"[[127], [34]]",-7.520221
1577410875,256452520296306528,1,"[[129], [24]]",-7.98269


In [41]:
best_sc = -10000
best_seq = []
for i in np.arange(100000):
  sequence1 = hmm.sample(5)[0]
  score1 = hmm.score(sequence1)
  if score1 > best_sc:
    best_seq = sequence1
    best_sc = score1

In [42]:
best_seq

array([[125],
       [ 90],
       [ 90],
       [ 90],
       [ 90]])

In [43]:
hmm.decode(best_seq)

(-13.10102456431626, array([0, 1, 1, 1, 1]))