In [1]:
import numpy as np
import matplotlib.pyplot as plt 

import pandas as pd
from collections import Counter

In [2]:
# Load the sequence fact dataset from CSV and preview its first ten rows.

df = pd.read_csv(filepath_or_buffer='../datasets/sequence_fact.csv')
df.head(n=10)

Unnamed: 0,sequence_id,fullVisitorId,event_name,event_datetime,conversion_proximity
0,0099Rqojoj1MCXN,7343617347507729080,organic_search,2018-04-15 17:31:50,75.0
1,0099Rqojoj1MCXN,7343617347507729080,dead_end,2018-04-15 17:33:05,0.0
2,00A9Lkka73okUx2,89656057821147903,organic_search,2017-09-14 16:36:56,1033.0
3,00A9Lkka73okUx2,89656057821147903,dead_end,2017-09-14 16:54:09,0.0
4,00B30tmbMwJn7Cf,4307745811624101170,organic_search,2017-04-21 02:41:23,1.0
5,00B30tmbMwJn7Cf,4307745811624101170,dead_end,2017-04-21 02:41:24,0.0
6,00BKxKnEYlKbw9b,7129167701457127936,organic_search,2016-10-02 15:16:09,1.0
7,00BKxKnEYlKbw9b,7129167701457127936,dead_end,2016-10-02 15:16:10,0.0
8,00EttOfsTTyp45B,3217678225016118393,referral,2017-10-23 19:44:20,143.0
9,00EttOfsTTyp45B,3217678225016118393,dead_end,2017-10-23 19:46:43,0.0


In [3]:
def convert_to_tuples(l):
    """Pair every element of ``l`` with the element that follows it.

    Parameters
    ----------
    l : sequence
        Indexable sequence that supports ``len``, e.g. the ordered steps
        of a single path.

    Returns
    -------
    list of tuple
        ``[(l[0], l[1]), (l[1], l[2]), ...]``. The list is empty when
        ``l`` holds fewer than two elements.
    """
    return [(l[pos], l[pos + 1]) for pos in range(len(l) - 1)]

In [4]:
# Drop touchpoints older than 45 days: conversion_proximity is divided by
# 86400 (seconds per day) so the threshold is expressed in days. Rows whose
# proximity is missing fail the comparison and are dropped as well.

df1 = (
    df[df['conversion_proximity'].div(86400).le(45)]
    .sort_values(by=['sequence_id', 'event_datetime'])
)

# Collapse every sequence into a single '>'-delimited string of its event
# names, in the sorted order established above.
df2 = (
    df1.groupby('sequence_id')['event_name']
    .agg(lambda events: '>'.join(events))
    .reset_index()
)

# Prefix each path with a synthetic 'start' state; df2 itself is left untouched.
df3 = df2.assign(event_name='start>' + df2['event_name'])

df3.head()

Unnamed: 0,sequence_id,event_name
0,0099Rqojoj1MCXN,start>organic_search>dead_end
1,00A9Lkka73okUx2,start>organic_search>dead_end
2,00B30tmbMwJn7Cf,start>organic_search>dead_end
3,00BKxKnEYlKbw9b,start>organic_search>dead_end
4,00EttOfsTTyp45B,start>referral>dead_end


In [5]:
# Create the transition matrix.
#
# Each '>'-delimited path is split into consecutive (from_state, to_state)
# pairs; the pairs from all sequences are then flattened into one list.

df3['paths'] = df3['event_name'].map(
    lambda journey: convert_to_tuples(journey.split('>'))
)
paths = df3['paths'].explode().tolist()

# Count every pair and pivot the counts so that rows are the originating
# state and columns are the next state; pairs never observed count as 0.
matrix = pd.Series(Counter(paths)).unstack().fillna(0)

# Normalise each row by its total so the entries become transition
# probabilities out of that row's state.
tm = matrix.div(matrix.sum(axis=1), axis='index')

# Every originating state except the synthetic 'start' is a channel.
channels = [state for state in tm.index if state != 'start']

# Put the two terminal states first, followed by the channels.
tm = tm.loc[:, ['conversion', 'dead_end', *channels]]

tm

Unnamed: 0,conversion,dead_end,(other),affiliates,direct,display,organic_search,paid_search,referral,social
(other),0.0,0.8,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
affiliates,0.0,0.80494,0.0,0.147785,0.0,0.000426,0.014906,0.0,0.029387,0.002555
direct,0.011031,0.78249,0.0,0.001606,0.168461,0.000803,0.014297,0.001821,0.016867,0.002624
display,0.003412,0.745522,0.0,0.0,0.000569,0.230594,0.007677,0.004834,0.007393,0.0
organic_search,0.007944,0.854814,0.0,0.001194,7.8e-05,0.001604,0.120747,0.003522,0.008218,0.001878
paid_search,0.020988,0.707177,0.0,0.000339,0.0,0.005755,0.079892,0.178741,0.005755,0.001354
referral,0.045536,0.695282,0.0,0.002005,7.2e-05,0.002005,0.017327,0.000931,0.234266,0.002578
social,0.000598,0.947795,0.0,0.000199,4e-05,0.00016,0.002313,0.000239,0.001476,0.04718
start,0.0,0.0,4e-05,0.018813,0.155649,0.025662,0.441956,0.021822,0.098387,0.23767


https://www.databricks.com/notebooks/multi_touch_attribution/index.html#04_markov_chains.html