In [10]:
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw touchpoint log, order it by cookie (descending) and then by
# `time` (ascending), and number each cookie's rows 1, 2, 3, ... in that order.
df = (
    pd.read_csv('attribution data.csv')
    .sort_values(by=['cookie', 'time'], ascending=[False, True])
    .assign(visit_order=lambda touches: touches.groupby('cookie').cumcount() + 1)
)

In [11]:
# Preview the first rows of the sorted frame, including the new `visit_order` column.
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel,visit_order
586736,ooooohAFofEnonEikhAi3fF9o,2018-07-14T17:17:12Z,impression,0,0.0,Paid Search,1
586734,ooooiBh70D3k3BfAhDFfii9h7,2018-07-03T12:57:25Z,impression,0,0.0,Paid Search,1
586735,ooooiBh70D3k3BfAhDFfii9h7,2018-07-19T08:17:59Z,impression,0,0.0,Online Video,2
586731,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-06T23:30:38Z,impression,0,0.0,Online Display,1
586732,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-12T23:50:45Z,impression,0,0.0,Online Display,2


# Path creation

In [12]:
# One row per cookie: `channel` becomes the list of distinct channels that
# cookie touched, in the order they first appear in its rows.
df_paths = (
    df.groupby('cookie')['channel']
    .agg(lambda touches: touches.unique().tolist())
    .reset_index()
)

In [13]:
# Keep only the final row of every cookie (in the current row order) and
# retain its conversion flag.
df_last_interaction = df.loc[
    ~df.duplicated('cookie', keep='last'),
    ['cookie', 'conversion'],
]

In [14]:
# Attach each cookie's last-interaction conversion flag to its channel list.
df_paths = df_paths.merge(df_last_interaction, on='cookie', how='left')

In [15]:
# Show the per-cookie channel lists together with the merged conversion flag.
df_paths

Unnamed: 0,cookie,channel,conversion
0,00000FkCnDfDDf0iC97iC703B,"[Instagram, Online Display]",0
1,0000nACkD9nFkBBDECD3ki00E,[Paid Search],0
2,0003EfE37E93D0BC03iBhBBhF,[Paid Search],0
3,00073CFE3FoFCn70fBhB3kfon,[Instagram],0
4,00079hhBkDF3k3kDkiFi9EFAD,[Paid Search],0
...,...,...,...
240103,ooooE0hkAFBkED90ChDDiBFAf,[Online Display],0
240104,ooooEBE0o0D97ACAAAnDoi3F0,[Online Display],0
240105,ooooEiB0CCoEf9fiiC90Dfhfk,[Online Display],0
240106,ooooiBh70D3k3BfAhDFfii9h7,"[Paid Search, Online Video]",0


In [16]:
# Wrap every cookie's channel list as ['Start', <channels...>, <terminal>],
# where the terminal state is 'Null' when `conversion` equals 0 and
# 'Conversion' otherwise.  The lists are assembled directly instead of being
# joined into one string and split again, so a channel name that itself
# contains ', ' is kept intact.
df_paths['path'] = pd.Series(
    [
        ['Start'] + list(channels) + ['Null' if converted == 0 else 'Conversion']
        for channels, converted in zip(df_paths['channel'], df_paths['conversion'])
    ],
    index=df_paths.index,
    dtype=object,
)

# Take an explicit copy of the column subset so that columns added to
# `df_paths` later are written to an independent frame, not to a slice.
df_paths = df_paths[['cookie', 'path', 'conversion']].copy()

# 10 most common paths

In [35]:
# Human-readable form of each path, e.g. 'Start -> Facebook -> Null'.
df_paths['string'] = df_paths['path'].map(' -> '.join)

In [49]:
# Ten most frequent paths: number of rows with a non-null `conversion`
# value per path string, largest first.
(
    df_paths
    .groupby('string')['conversion']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

string
Start -> Paid Search -> Null                      59396
Start -> Facebook -> Null                         43568
Start -> Online Video -> Null                     26402
Start -> Online Display -> Null                   25997
Start -> Instagram -> Null                        15659
Start -> Facebook -> Instagram -> Null            10470
Start -> Instagram -> Facebook -> Null             7467
Start -> Paid Search -> Conversion                 3757
Start -> Paid Search -> Facebook -> Null           3356
Start -> Paid Search -> Online Display -> Null     3101
Name: conversion, dtype: int64

## Paths with the most conversions

In [48]:
# Same ranking as above, restricted to rows whose `conversion` flag is 1.
(
    df_paths.loc[df_paths['conversion'] == 1]
    .groupby('string')['conversion']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

string
Start -> Paid Search -> Conversion                      3757
Start -> Facebook -> Conversion                         3001
Start -> Online Video -> Conversion                     2634
Start -> Online Display -> Conversion                   1578
Start -> Facebook -> Instagram -> Conversion            1345
Start -> Instagram -> Conversion                        1028
Start -> Instagram -> Facebook -> Conversion             914
Start -> Online Display -> Paid Search -> Conversion     271
Start -> Paid Search -> Online Display -> Conversion     260
Start -> Paid Search -> Facebook -> Conversion           253
Name: conversion, dtype: int64

# Markov Chains

In [54]:
# Every journey as a list of states (one list per cookie).
list_of_paths = df_paths['path']
# Number of 'Conversion' entries across all journeys.
total_conversions = sum(journey.count('Conversion') for journey in list_of_paths)
# Conversions per journey; used later as the baseline for removal effects.
base_conversion_rate = total_conversions / len(list_of_paths)

## Transition states

In [60]:
def transition_states(list_of_paths):
    """Count how often each state is immediately followed by each other state.

    Parameters
    ----------
    list_of_paths : iterable of list of str
        Journeys, each a sequence of state names.

    Returns
    -------
    dict
        Maps 'origin>destination' to the number of times that step occurs.
        Every ordered pair of states seen in ``list_of_paths`` is present as
        a key (pairs that never occur map to 0).  Steps that start from
        'Conversion' or 'Null' are not counted.
    """
    list_of_unique_channels = set(x for element in list_of_paths for x in element)
    transition_counts = {x + '>' + y: 0
                         for x in list_of_unique_channels
                         for y in list_of_unique_channels}

    # One pass over consecutive pairs.  States are compared by identity of the
    # whole name (not by substring), so a state such as 'Video' is never
    # confused with 'Online Video', and a path that ends on a non-terminal
    # state simply contributes no step for its last element.
    for user_path in list_of_paths:
        for origin, destination in zip(user_path, user_path[1:]):
            if origin not in ('Conversion', 'Null'):
                transition_counts[origin + '>' + destination] += 1

    return transition_counts


# Count the observed state-to-state steps across all journeys.
trans_states = transition_states(list_of_paths)

## Transition probabilities

In [65]:
def transition_prob(trans_dict):
    """Turn transition counts into per-origin transition probabilities.

    Parameters
    ----------
    trans_dict : dict
        Maps 'origin>destination' keys to non-negative counts.

    Returns
    -------
    collections.defaultdict
        Maps each key whose count is positive, and whose origin is neither
        'Conversion' nor 'Null', to ``count / total``, where ``total`` is the
        sum of the positive counts that share the same origin.
    """
    terminal_states = ('Conversion', 'Null')

    # The origin is everything before the first '>' of the key.  Matching on
    # the full origin name (rather than a substring of the key) keeps states
    # such as 'Video' and 'Online Video' separate, and working from the keys
    # alone means the function depends only on its argument.
    positive_counts = {}
    outgoing_totals = defaultdict(int)
    for key, count in trans_dict.items():
        origin = key.partition('>')[0]
        if origin in terminal_states or not count > 0:
            continue
        positive_counts[key] = (origin, count)
        outgoing_totals[origin] += count

    trans_prob = defaultdict(dict)
    for key, (origin, count) in positive_counts.items():
        trans_prob[key] = float(count) / float(outgoing_totals[origin])

    return trans_prob


# Normalise the transition counts into probabilities per origin state.
trans_prob = transition_prob(trans_states)

In [67]:
# Inspect the 'origin>destination' -> probability mapping.
trans_prob

defaultdict(dict,
            {'Paid Search>Instagram': 0.03419630796938316,
             'Paid Search>Null': 0.7707789284106259,
             'Paid Search>Conversion': 0.053309320126069336,
             'Paid Search>Online Display': 0.04805267897343539,
             'Paid Search>Facebook': 0.06444169293111211,
             'Paid Search>Online Video': 0.029221071589374155,
             'Instagram>Paid Search': 0.04580924271216593,
             'Instagram>Null': 0.6297861157158452,
             'Instagram>Conversion': 0.057979218048760765,
             'Instagram>Online Display': 0.023531314438199977,
             'Instagram>Facebook': 0.2187765333764606,
             'Instagram>Online Video': 0.024117575708567502,
             'Start>Paid Search': 0.31739883718993117,
             'Start>Instagram': 0.11918803205224315,
             'Start>Online Display': 0.14264414346877238,
             'Start>Facebook': 0.2784080497109634,
             'Start>Online Video': 0.14236093757808985,
   

In [68]:
def transition_matrix(list_of_paths, transition_probabilities):
    """Arrange transition probabilities into a square state-by-state matrix.

    Parameters
    ----------
    list_of_paths : iterable of list of str
        Journeys; the distinct state names they contain become both the row
        labels (origin) and the column labels (destination).
    transition_probabilities : dict
        Maps 'origin>destination' keys to probabilities.

    Returns
    -------
    pandas.DataFrame
        Float matrix, zero everywhere except the supplied probabilities and
        a 1.0 on the diagonal for 'Conversion' and 'Null' when those states
        are present.
    """
    states = list(set(x for element in list_of_paths for x in element))

    # Allocate the whole matrix at once rather than growing it one row and
    # one column per state.
    trans_matrix = pd.DataFrame(0.0, index=states, columns=states)

    # 'Conversion' and 'Null' loop back onto themselves.  A single-step
    # `.at[row, col]` write is used because a chained `.loc[row][col] = ...`
    # write goes to a temporary object and is lost under pandas Copy-on-Write.
    for absorbing_state in ('Conversion', 'Null'):
        if absorbing_state in trans_matrix.index:
            trans_matrix.at[absorbing_state, absorbing_state] = 1.0

    for key, value in transition_probabilities.items():
        origin, destination = key.split('>')
        trans_matrix.at[origin, destination] = value

    return trans_matrix


# Lay the transition probabilities out as a state-by-state matrix.
trans_matrix = transition_matrix(list_of_paths, trans_prob)

In [69]:
# Rows are origin states, columns are destination states.
trans_matrix

Unnamed: 0,Paid Search,Instagram,Start,Null,Conversion,Online Display,Facebook,Online Video
Paid Search,0.0,0.034196,0.0,0.770779,0.053309,0.048053,0.064442,0.029221
Instagram,0.045809,0.0,0.0,0.629786,0.057979,0.023531,0.218777,0.024118
Start,0.317399,0.119188,0.0,0.0,0.0,0.142644,0.278408,0.142361
Null,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Conversion,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Online Display,0.092386,0.029363,0.0,0.756643,0.050324,0.0,0.053986,0.017299
Facebook,0.050655,0.173072,0.0,0.673123,0.05322,0.024352,0.0,0.025577
Online Video,0.047947,0.031908,0.0,0.763757,0.078146,0.01892,0.059323,0.0


In [70]:
def removal_effects(df, conversion_rate):
    """Compute the removal effect of every channel in a transition matrix.

    For each column of ``df`` other than 'Start', 'Null' and 'Conversion',
    that channel's row and column are dropped, the probability of ending in
    'Conversion' when starting from 'Start' is recomputed, and the effect is
    reported as ``1 - removal_cvr / conversion_rate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Square transition matrix (rows = origin, columns = destination) whose
        labels include 'Start', 'Null' and 'Conversion'.  It is not modified.
    conversion_rate : float
        Denominator for the effect; the notebook passes the base conversion
        rate observed in the data.

    Returns
    -------
    dict
        Maps each channel name to its removal effect.
    """
    absorbing_states = ['Null', 'Conversion']
    channels = [channel for channel in df.columns
                if channel not in ['Start'] + absorbing_states]

    removal_effects_dict = {}
    for channel in channels:
        removal_df = df.drop(channel, axis=1).drop(channel, axis=0)

        # Probability that used to flow into the removed channel is added to
        # 'Null' so every row sums to 1 again.  Single-step label writes are
        # used; chained `.loc[row][col] = ...` writes are lost under pandas
        # Copy-on-Write.  The conversion probability computed below reads
        # only the transient block and the 'Conversion' column, so this step
        # keeps the matrix consistent without affecting the result.
        lost_mass = 1.0 - removal_df.sum(axis=1)
        removal_df['Null'] = removal_df['Null'] + lost_mass
        removal_df.loc['Null', 'Null'] = 1.0

        # Rows of the non-absorbing states, split into the part that stays
        # among non-absorbing states (columns taken in row order, so the
        # block is square and aligned) and the part that goes to 'Conversion'.
        transient_rows = removal_df.drop(absorbing_states, axis=0)
        to_transient = transient_rows.loc[:, transient_rows.index]
        to_conversion = transient_rows['Conversion']

        # Solve (I - Q) x = r for the probability x of reaching 'Conversion'
        # from each non-absorbing state, instead of forming the inverse.
        system = (np.identity(len(to_transient.columns))
                  - to_transient.to_numpy(dtype=float))
        conversion_prob = np.linalg.solve(system,
                                          to_conversion.to_numpy(dtype=float))
        removal_cvr = pd.Series(conversion_prob,
                                index=transient_rows.index).loc['Start']

        removal_effects_dict[channel] = 1 - removal_cvr / conversion_rate

    return removal_effects_dict


# Removal effect of each channel, relative to the base conversion rate.
removal_effects_dict = removal_effects(trans_matrix, base_conversion_rate)

In [71]:
# Inspect the channel -> removal effect mapping.
removal_effects_dict

{'Paid Search': 0.3311037560086154,
 'Instagram': 0.21731366149038456,
 'Online Display': 0.15435482356041286,
 'Facebook': 0.3547597674182721,
 'Online Video': 0.2069141165564219}

In [72]:
def markov_chain_allocations(removal_effects, total_conversions):
    """Split ``total_conversions`` across channels in proportion to their removal effects.

    Parameters
    ----------
    removal_effects : dict
        Maps channel name to removal effect.
    total_conversions : number
        Quantity to distribute.

    Returns
    -------
    dict
        Maps each channel to ``(effect / sum of all effects) * total_conversions``.
    """
    effect_total = np.sum(list(removal_effects.values()))

    allocations = {}
    for channel, effect in removal_effects.items():
        share = effect / effect_total
        allocations[channel] = share * total_conversions
    return allocations


# Distribute the observed conversions across channels by removal-effect share.
attributions = markov_chain_allocations(removal_effects_dict, total_conversions)

In [73]:
# Inspect the conversions attributed to each channel.
attributions

{'Paid Search': 4618.891257291355,
 'Instagram': 3031.5215485558924,
 'Online Display': 2153.2469267590836,
 'Facebook': 4948.892177847522,
 'Online Video': 2886.448089546147}