## Estimando la cadena de Markov de la trayectoria de usuarios

Para cada cliente y visita, el dataset contiene la siguiente información:
- Cookie: Clave única generada aleatoriamente para identificar a los usuarios/clientes de la web
- Timestamp: Instante de la visita
- Interaction: Variable categórica que identifica la interacción del visitante con la web
- Conversion: `True` o `False`, indicando si el visitante se ha convertido o no
- Conversion Value: Tipo de conversión que ha ocurrido
- Channel: El canal que ha atraído al usuario

In [41]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Load the raw touchpoint log, then order it so that each cookie's rows are
# contiguous (cookies descending) and sorted by `time` ascending within a cookie.
df = pd.read_csv('data.csv')
df = df.sort_values(by=['cookie', 'time'], ascending=[False, True])

# 1-based position of each row within its cookie, following the sort order above.
df['visit_order'] = df.groupby('cookie').cumcount().add(1)

In [46]:
df

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel,visit_order
586736,ooooohAFofEnonEikhAi3fF9o,2018-07-14T17:17:12Z,impression,0,0.0,Paid Search,1
586734,ooooiBh70D3k3BfAhDFfii9h7,2018-07-03T12:57:25Z,impression,0,0.0,Paid Search,1
586735,ooooiBh70D3k3BfAhDFfii9h7,2018-07-19T08:17:59Z,impression,0,0.0,Online Video,2
586731,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-06T23:30:38Z,impression,0,0.0,Online Display,1
586732,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-12T23:50:45Z,impression,0,0.0,Online Display,2
...,...,...,...,...,...,...,...
9,0000nACkD9nFkBBDECD3ki00E,2018-07-11T22:19:53Z,impression,0,0.0,Paid Search,6
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram,1
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display,2
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display,3


In [47]:
df.shape

(586737, 7)

In [48]:
# Collapse each cookie's rows into a list of its distinct channels, in order of
# first appearance (Series.unique keeps encounter order and df is time-sorted).
# NOTE(review): unique() drops every repeat of a channel, so a journey such as
# A -> B -> A is stored as [A, B] and no self-transition can ever be counted —
# confirm this simplification is intended.
df_paths = df.groupby('cookie')['channel'].aggregate(lambda x: x.unique().tolist()).reset_index()

In [49]:
# Keep one row per cookie — the last one in the current row order — and retain
# only its `conversion` flag as the outcome of the whole journey.
# NOTE(review): a conversion recorded on an earlier row of the same cookie would
# be ignored here; presumably conversions only occur on the final visit — confirm.
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'conversion']]

In [50]:
# Attach each cookie's final `conversion` flag to its channel path (left join on cookie).
df_paths = df_paths.merge(df_last_interaction, how='left', on='cookie')

In [51]:
def decorate_paths(
    channel_list,
    null_condition,
    start_state="Start",
    null_state="Null",
    converted_state="Conversion",
):
    """Wrap a channel path with its start marker and its terminal marker.

    Args:
        channel_list: List of channel names visited, in order.
        null_condition: Truthy when the journey did NOT convert.
        start_state: Label prepended to every path.
        null_state: Label appended when ``null_condition`` is truthy.
        converted_state: Label appended otherwise.

    Returns:
        A new list ``[start_state, *channel_list, terminal]``; the input list
        is not modified.
    """
    if null_condition:
        closing_state = null_state
    else:
        closing_state = converted_state

    decorated = [start_state] + channel_list
    decorated.append(closing_state)
    return decorated
        
# Build the decorated path of every cookie. The two columns are walked in
# lockstep instead of using DataFrame.iterrows(), which constructs a Series for
# every row and is needlessly slow on a frame of this size.
full_paths = [
    decorate_paths(channels, converted == 0)
    for channels, converted in zip(df_paths["channel"], df_paths["conversion"])
]

In [52]:
# Store each decorated path alongside its cookie (full_paths follows df_paths' row order).
df_paths["path"] = full_paths

In [53]:
# Keep only the cookie identifier and its decorated path, then preview the first rows.
df_paths = df_paths[["cookie", "path"]]
df_paths.head()

Unnamed: 0,cookie,path
0,00000FkCnDfDDf0iC97iC703B,"[Start, Instagram, Online Display, Null]"
1,0000nACkD9nFkBBDECD3ki00E,"[Start, Paid Search, Null]"
2,0003EfE37E93D0BC03iBhBBhF,"[Start, Paid Search, Null]"
3,00073CFE3FoFCn70fBhB3kfon,"[Start, Instagram, Null]"
4,00079hhBkDF3k3kDkiFi9EFAD,"[Start, Paid Search, Null]"


**Cadenas de Markov**
En pocas palabras, el estudio de cadenas de Markov se puede reducir a 2 grandes pasos:

1. Calcular la probabilidad de transición entre estados
2. Calcular los efectos de eliminación

### 1. Calcular la probabilidad de transición entre estados

In [56]:
list_of_paths = df_paths['path']

# Number of 'Conversion' entries found across all decorated paths.
total_conversions = sum(
    journey.count('Conversion') for journey in list_of_paths.tolist()
)

# Share of paths that converted: conversions divided by the number of paths.
base_conversion_rate = total_conversions / len(list_of_paths)

Podemos comprobar el número total de conversiones y la tasa base de conversión:

In [57]:
total_conversions

17639

In [58]:
base_conversion_rate

0.07346277508454528

In [59]:
def transition_states(
    list_of_paths,
    null_state = "Null",
    converted_state = "Conversion"
):
    """Count how often each state is immediately followed by each other state.

    Args:
        list_of_paths: Iterable of paths, each a sequence of state names
            (e.g. ``["Start", "Facebook", "Conversion"]``).
        null_state: Name of the non-converting terminal state.
        converted_state: Name of the converting terminal state.

    Returns:
        A dict keyed by ``"origin => destination"`` for every ordered pair of
        states seen in the paths (pairs never observed map to 0). Transitions
        whose origin is one of the two terminal states are never counted.
    """
    # Materialise once so that one-shot iterables can be traversed twice.
    paths = list(list_of_paths)
    unique_states = set(state for path in paths for state in path)

    transition_counts = {
        origin + ' => ' + destination: 0
        for origin in unique_states
        for destination in unique_states
    }

    frontier_states = {null_state, converted_state}

    # Single pass over consecutive pairs. States are compared by equality, so a
    # channel whose name is a substring of another one (e.g. "Search" and
    # "Paid Search") is not counted twice, and the last element of a path
    # simply has no outgoing transition.
    for path in paths:
        for origin, destination in zip(path, path[1:]):
            if origin not in frontier_states:
                transition_counts[origin + ' => ' + destination] += 1

    return transition_counts

# Count the observed state-to-state transitions across all user paths and display them.
transitions = transition_states(list_of_paths)
transitions

{'Null => Null': 0,
 'Null => Paid Search': 0,
 'Null => Online Video': 0,
 'Null => Instagram': 0,
 'Null => Facebook': 0,
 'Null => Conversion': 0,
 'Null => Start': 0,
 'Null => Online Display': 0,
 'Paid Search => Null': 68476,
 'Paid Search => Paid Search': 0,
 'Paid Search => Online Video': 2596,
 'Paid Search => Instagram': 3038,
 'Paid Search => Facebook': 5725,
 'Paid Search => Conversion': 4736,
 'Paid Search => Start': 0,
 'Paid Search => Online Display': 4269,
 'Online Video => Null': 31285,
 'Online Video => Paid Search': 1964,
 'Online Video => Online Video': 0,
 'Online Video => Instagram': 1307,
 'Online Video => Facebook': 2430,
 'Online Video => Conversion': 3201,
 'Online Video => Start': 0,
 'Online Video => Online Display': 775,
 'Instagram => Null': 31153,
 'Instagram => Paid Search': 2266,
 'Instagram => Online Video': 1193,
 'Instagram => Instagram': 0,
 'Instagram => Facebook': 10822,
 'Instagram => Conversion': 2868,
 'Instagram => Start': 0,
 'Instagram => On

In [60]:
def transition_prob(trans_dict, null_state="Null", converted_state="Conversion"):
    """Turn transition counts into per-origin transition probabilities.

    Args:
        trans_dict: Mapping ``"origin => destination" -> count``.
        null_state: Name of the non-converting terminal state.
        converted_state: Name of the converting terminal state.

    Returns:
        A ``defaultdict(dict)`` mapping ``"origin => destination"`` to
        ``count / total outgoing count of origin``. Only strictly positive
        counts are included, and transitions leaving a terminal state are
        omitted. The defaultdict type is kept for backward compatibility; note
        that indexing a missing key inserts an empty dict, so prefer
        ``.get()`` / ``in`` for lookups.
    """
    frontier_states = {null_state, converted_state}

    # Total outgoing count per origin. The origin is parsed out of the key and
    # compared exactly, so "Search" never absorbs the counts of "Paid Search",
    # and the states come from trans_dict itself rather than from a global.
    outgoing_totals = defaultdict(int)
    for key, count in trans_dict.items():
        if count > 0:
            outgoing_totals[key.partition(' => ')[0]] += count

    trans_prob = defaultdict(dict)
    for key, count in trans_dict.items():
        origin = key.partition(' => ')[0]
        if origin in frontier_states or count <= 0:
            continue
        trans_prob[key] = float(count) / float(outgoing_totals[origin])

    return trans_prob


# Normalise the raw transition counts into per-origin probabilities and display them.
transitions_prob = transition_prob(transitions)
transitions_prob

defaultdict(dict,
            {'Paid Search => Null': 0.7707789284106259,
             'Paid Search => Online Video': 0.029221071589374155,
             'Paid Search => Instagram': 0.03419630796938316,
             'Paid Search => Facebook': 0.06444169293111211,
             'Paid Search => Conversion': 0.053309320126069336,
             'Paid Search => Online Display': 0.04805267897343539,
             'Online Video => Null': 0.7637566525072018,
             'Online Video => Paid Search': 0.04794687759386749,
             'Online Video => Instagram': 0.03190762169815927,
             'Online Video => Facebook': 0.059323275230701626,
             'Online Video => Conversion': 0.0781455983594551,
             'Online Video => Online Display': 0.018919974610614718,
             'Instagram => Null': 0.6297861157158452,
             'Instagram => Paid Search': 0.04580924271216593,
             'Instagram => Online Video': 0.024117575708567502,
             'Instagram => Facebook': 0.218776

In [61]:
def transition_matrix(
    list_of_paths,
    transition_probabilities,
    null_state = "Null",
    converted_state = "Conversion"
):
    """Arrange transition probabilities as a square state-by-state DataFrame.

    Args:
        list_of_paths: Iterable of paths (sequences of state names); only used
            to discover the set of states.
        transition_probabilities: Mapping ``"origin => destination" -> prob``.
        null_state: Name of the non-converting terminal state.
        converted_state: Name of the converting terminal state.

    Returns:
        A float DataFrame whose rows are origins and whose columns are
        destinations (same labels, same order). The two terminal states get a
        1.0 self-transition (absorbing); every other cell defaults to 0.0.
    """
    states = list(set(state for path in list_of_paths for state in path))
    frontier_states = {null_state, converted_state}

    # Allocate the full square frame up front instead of growing it one
    # row/column at a time.
    trans_matrix = pd.DataFrame(0.0, index=states, columns=states)

    # Single-step .at indexing: the chained form `.loc[row][col] = value`
    # may write to a temporary and is a no-op under pandas Copy-on-Write.
    for state in states:
        if state in frontier_states:
            trans_matrix.at[state, state] = 1.0

    for key, value in transition_probabilities.items():
        origin, destination = key.split(' => ')
        trans_matrix.at[origin, destination] = value

    return trans_matrix


# Square transition matrix: rows are origin states, columns are destination states.
T_matrix = transition_matrix(list_of_paths, transitions_prob)

In [68]:
T_matrix.T

Unnamed: 0,Null,Paid Search,Online Video,Instagram,Facebook,Conversion,Start,Online Display
Null,1.0,0.770779,0.763757,0.629786,0.673123,0.0,0.0,0.756643
Paid Search,0.0,0.0,0.047947,0.045809,0.050655,0.0,0.317399,0.092386
Online Video,0.0,0.029221,0.0,0.024118,0.025577,0.0,0.142361,0.017299
Instagram,0.0,0.034196,0.031908,0.0,0.173072,0.0,0.119188,0.029363
Facebook,0.0,0.064442,0.059323,0.218777,0.0,0.0,0.278408,0.053986
Conversion,0.0,0.053309,0.078146,0.057979,0.05322,1.0,0.0,0.050324
Start,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Online Display,0.0,0.048053,0.01892,0.023531,0.024352,0.0,0.142644,0.0


### 2. Calcular los efectos de eliminación

In [69]:
def removal_effects(
    df,
    conversion_rate,
    start_state="Start",
    null_state="Null",
    converted_state="Conversion",
):
    """Compute the removal effect of every channel in a transition matrix.

    For each channel, the channel's row and column are dropped from the matrix
    and the probability of eventually reaching ``converted_state`` from
    ``start_state`` is recomputed. The removal effect is the relative drop of
    that probability with respect to ``conversion_rate``.

    Args:
        df: Square transition-matrix DataFrame (rows = origin, columns =
            destination) containing the start state, both terminal states and
            one row/column per channel.
        conversion_rate: Baseline conversion rate used as the denominator.
        start_state: Name of the start state.
        null_state: Name of the non-converting terminal state.
        converted_state: Name of the converting terminal state.

    Returns:
        Dict mapping channel name to ``1 - removal_cvr / conversion_rate``.

    Raises:
        numpy.linalg.LinAlgError: If ``I - Q`` is singular for some removal.
        KeyError: If a required state is missing from ``df``.
    """
    absorbing_states = [null_state, converted_state]
    reserved_states = {start_state, null_state, converted_state}
    channels = [channel for channel in df.columns if channel not in reserved_states]

    removal_effects_dict = {}
    for channel in channels:
        removal_df = df.drop(channel, axis=1).drop(channel, axis=0)

        # Probability mass that used to flow into the removed channel is
        # implicitly lost to the null state. Only the absorption probability
        # into the converted state is needed, and that depends solely on the
        # transient block Q and the conversion column of R, so the null column
        # does not have to be rebuilt (which also avoids chained assignment).
        transient_rows = removal_df.drop(absorbing_states, axis=0)
        transient_states = list(transient_rows.index)

        # Select columns by the row labels so Q's rows and columns are aligned.
        to_transient = transient_rows.loc[:, transient_states].to_numpy(dtype=float)
        to_conversion = transient_rows[converted_state].to_numpy(dtype=float)

        # Absorption probabilities b solve (I - Q) b = r; solving the system is
        # more stable than forming the explicit inverse.
        conversion_probability = pd.Series(
            np.linalg.solve(
                np.identity(len(transient_states)) - to_transient,
                to_conversion,
            ),
            index=transient_states,
        )
        removal_cvr = conversion_probability[start_state]

        removal_effects_dict[channel] = 1 - removal_cvr / conversion_rate

    return removal_effects_dict


# Removal effect of each channel, measured against the empirical base conversion rate.
removal_effects_dict = removal_effects(T_matrix, base_conversion_rate)

In [70]:
removal_effects_dict

{'Paid Search': 0.3311037560086154,
 'Online Video': 0.20691411655642178,
 'Instagram': 0.21731366149038456,
 'Facebook': 0.3547597674182722,
 'Online Display': 0.15435482356041286}

In [72]:
def markov_chain_allocations(removal_effects, total_conversions):
    """Distribute conversions across channels in proportion to removal effect.

    Args:
        removal_effects: Mapping of channel name to its removal effect.
        total_conversions: Number of conversions to distribute.

    Returns:
        Dict mapping each channel to its share of ``total_conversions``, i.e.
        its removal effect divided by the sum of all removal effects, times
        ``total_conversions``.
    """
    effect_total = np.sum(list(removal_effects.values()))

    allocations = {}
    for channel, effect in removal_effects.items():
        share = effect / effect_total
        allocations[channel] = share * total_conversions
    return allocations


# Split the observed conversions across channels in proportion to their removal effects.
attributions = markov_chain_allocations(removal_effects_dict, total_conversions)

In [74]:
attributions

{'Paid Search': 4618.891257291355,
 'Online Video': 2886.448089546145,
 'Instagram': 3031.5215485558924,
 'Facebook': 4948.892177847523,
 'Online Display': 2153.2469267590836}

In [75]:
# One row per channel with its attributed conversions, largest first.
ddf = pd.DataFrame(
    [
        {"channel": channel, "n": conversions}
        for channel, conversions in attributions.items()
    ]
)
ddf = ddf.sort_values(by="n", ascending=False)

In [76]:
import plotly.express as px
# Bar chart of attributed conversions per channel; the bar colour is also mapped to `n`.
fig = px.bar(ddf, x="channel", y="n", color="n")
fig.show()