In [None]:
from pdstools import IH
from pdstools.utils import cdh_utils

import polars as pl
import plotly.io as pio
import plotly as plotly

# Initialize plotly's offline notebook mode before any figure is shown.
plotly.offline.init_notebook_mode()
# Default renderer for all figures in this notebook.
# NOTE(review): "vscode" presumably needs changing when run in another front end - confirm.
pio.renderers.default = "vscode"

# Example IH Analysis

Interaction History (IH) is a rich source of data at the level of individual interactions. It contains the time of the interaction, the channel, the actions/treatments, the customer ID and is used to track different types of outcomes (decisions, sends, opens, clicks, etc). It does **not** contain customer attributes - only the IDs.

This notebook gives some examples of data analysis on IH. It uses plotly (visualizations) and polars (dataframes), but its purpose is to provide example analyses rather than reusable code. All of the analyses should be easy to replicate in other analytical BI environments - except perhaps the analysis of model performance / AUC.

This notebook uses sample data shipped with PDStools. Replace it with your actual IH data.

In [2]:
# Build the IH object that every later cell reads from, using the sample
# export zip. The path is relative; point it at your own export to analyze
# real data.
ih = IH.from_ds_export(
    "../../data/Data-pxStrategyResult_pxInteractionHistory_20210101T010000_GMT.zip"
)

Preview of the raw IH data

In [None]:
# Peek at the first rows of the raw IH data; `.collect()` materializes the
# result so the notebook can display it.
ih.data.head().collect()

The same interaction can occur multiple times: once when the first decision is made, then later when responses are captured (accepted, sent, clicked, etc.). For some of the analyses it makes more sense to group by interaction first. This is what that data looks like:

In [None]:
# Preview the per-interaction summary, keeping "Channel" as a grouping column.
# NOTE(review): `_summary_interactions` is underscore-prefixed, i.e. non-public
# by Python convention - it may change between PDStools versions.
ih.aggregates._summary_interactions(by=["Channel"]).head().collect()

# Distribution Analysis

A distribution of the offers (actions/treatments) is often the most obvious type of analysis. You can do an action distribution for specific outcomes (what is offered, what is accepted), view it conditionally (what got offered last month vs this month) - possibly with a delta view, or over time.

In [None]:
import plotly.express as px

# Treemap of outcome counts: the root node "ALL" splits by outcome first and
# then drills down through the action hierarchy.
hierarchy = ("Issue", "Group", "Direction", "Channel", "Name")

plot_data = ih.aggregates.summary_outcomes(by=list(hierarchy)).collect()

fig = px.treemap(
    plot_data,
    path=[px.Constant("ALL"), "Outcome", *hierarchy],
    values="Count",
    color="Count",
    branchvalues="total",
    height=640,
    template="pega",
)
# The tile labels already show the counts, so the color bar is hidden.
fig.update_coloraxes(showscale=False)
fig.update_traces(textinfo="label+value+percent parent")
fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})
fig

In [None]:
# Number of "Clicked" outcomes per action, keeping only actions that were
# clicked at least once, sorted by that count.
plot_data = (
    ih.data.group_by(["Name"])
    .agg(pl.col.Name.filter(pl.col.Outcome == "Clicked").len().alias("Count"))
    # The comparison must be parenthesized: in Python `&` binds tighter than
    # `!=`, so without the parentheses the filter would evaluate
    # `(Count.is_not_null() & Count) != 0` instead of the intended test.
    .filter(pl.col.Count.is_not_null() & (pl.col.Count != 0))
    .sort("Count")
)
px.bar(
    plot_data.collect(),
    x="Count",
    y="Name",
    template="pega",
    title="Action Distribution",
)

# Responses

A simple view of the responses over time.

In [None]:
# Distinct outcome labels in sorted order, skipping null and empty values.
distinct_outcomes = ih.data.select(pl.col.Outcome.unique().sort()).collect()
outcomes = [
    label
    for label in distinct_outcomes["Outcome"].to_list()
    if not (label is None or label == "")
]

# One count column per outcome: the number of rows carrying that outcome.
outcome_counts = [(pl.col.Outcome == label).sum().alias(label) for label in outcomes]

# Truncate the timestamps to whole days, count per day and channel, then
# reshape to long format (one row per day / channel / outcome) for plotting.
daily = ih.data.with_columns(pl.col.OutcomeTime.dt.truncate("1d"))
plot_data = (
    daily.group_by(["OutcomeTime", "Channel"])
    .agg(outcome_counts)
    .collect()
    .unpivot(
        index=["OutcomeTime", "Channel"],
        variable_name="Outcome",
        value_name="Count",
    )
)

px.bar(
    plot_data,
    x="OutcomeTime",
    y="Count",
    color="Outcome",
    facet_row="Channel",
    template="pega",
    title="Daily Responses",
)

# Success Rates

Success rates (accept rate, open rate, conversion rate) are interesting to track over time. In addition you may want to split by e.g. Channel, or contrast the rates for different experimental setups in an A-B testing set-up.

Use "plot_daily_accept_rate" to plot the accept rate per day and see how accept rates changed over time. To define the accept rate, pass the positive (here: Accepted) and negative (here: Rejected) behaviour to the function. Use kwargs to customize the graph. If there are too many time ticks on the x axis, reduce them using 'shrinkTicks'. If data is missing for certain days, force the graph to show gaps for the missing days by setting 'allTime': True. You can also define a hue.

In [None]:
# Only plot rows that have a usable channel (neither null nor empty).
has_channel = pl.col.Channel.is_not_null() & (pl.col.Channel != "")

ih.plots.success_rates_trend_line(by="Channel", query=has_channel)

# Model Performance

Similar to Success Rates: typically viewed over time, likely split by channel, conditioned on variations, e.g. NB vs AGB models.

In [None]:
def _group_auc(columns):
    """Return the AUC for one group.

    `columns` holds the group's series in the order given to `exprs` below:
    the engagement outcome first, then the propensity.
    """
    return cdh_utils.auc_from_probs(columns[0], columns[1])


# Per-interaction summary at a "1d" granularity, restricted to rows that have
# both a propensity and an engagement outcome.
scored_interactions = ih.aggregates._summary_interactions(
    every="1d", by="Channel"
).filter(
    pl.col.Propensity.is_not_null()
    & pl.col.Interaction_Outcome_Engagement.is_not_null()
)

# One performance value per (OutcomeTime, Channel) group; still lazy here.
performance = pl.map_groups(
    exprs=["Interaction_Outcome_Engagement", "Propensity"],
    function=_group_auc,
    return_dtype=pl.Float64,
).alias("Performance")
plot_data = scored_interactions.group_by(["OutcomeTime", "Channel"]).agg(performance)

# Sort by time so each channel's line is drawn left to right.
fig = px.line(
    plot_data.collect().sort(["OutcomeTime"]),
    x="OutcomeTime",
    y="Performance",
    color="Channel",
    template="pega",
    title="Model Performance over Time",
)
fig

# Propensity Distribution

IH also contains information about the factors that determine the prioritization of the offers: lever values, propensities etc.

Here we show the distribution of the propensities of the offers made. 


In [None]:
import plotly.figure_factory as ff

# Distinct channel names in sorted order, skipping null and empty values.
channels = [
    c
    for c in ih.data.select(pl.col.Channel.unique().sort())
    .collect()["Channel"]
    .to_list()
    if c is not None and c != ""
]

# One list of propensities per channel, in the same order as `channels`.
# - Null propensities carry no information for a density estimate, so they
#   are dropped before sampling.
# - Only a 10% sample is plotted; the fixed seed makes the figure
#   reproducible between runs.
plot_data = [
    ih.data.filter((pl.col.Channel == c) & pl.col.Propensity.is_not_null())
    .select(["Propensity"])
    .collect()["Propensity"]
    .sample(fraction=0.1, seed=42)
    .to_list()
    for c in channels
]

# Density curves only (no histogram bars), one per channel.
fig = ff.create_distplot(plot_data, group_labels=channels, show_hist=False)
fig.update_layout(title="Propensity Distribution")
fig

# Response Analysis

Time is one of the dimensions in IH. Here we take a look at how subsequent responses relate to the original decision. It shows, for example, how much time there typically is between the moment of decision and the click.

This type of analysis is usually part of attribution analysis when considering conversion modeling.


In [None]:
# Distinct outcome labels in sorted order, skipping null and empty values.
distinct_outcomes = ih.data.select(pl.col.Outcome.unique().sort()).collect()
outcomes = [
    label
    for label in distinct_outcomes["Outcome"].to_list()
    if not (label is None or label == "")
]

# Per interaction: the earliest timestamp is taken as the decision time, and
# for every outcome the latest timestamp at which that outcome was recorded.
decision_time = pl.col.OutcomeTime.min().alias("Decision_Time")
latest_per_outcome = [
    pl.col.OutcomeTime.filter(pl.col.Outcome == label).max().alias(label)
    for label in outcomes
]
per_interaction = (
    ih.data.filter(pl.col.OutcomeTime.is_not_null())
    .group_by("InteractionID")
    .agg([decision_time] + latest_per_outcome)
    .collect()
)

# Long format with the delay in seconds between decision and outcome; rows
# without a positive delay (including the decision itself) are dropped.
plot_data = (
    per_interaction.unpivot(
        index=["InteractionID", "Decision_Time"],
        variable_name="Outcome",
        value_name="Time",
    )
    .with_columns(
        Duration=(pl.col.Time - pl.col.Decision_Time).dt.total_seconds()
    )
    .filter(pl.col.Duration > 0)
)

fig = px.box(
    plot_data,
    x="Duration",
    y="Outcome",
    color="Outcome",
    template="pega",
)
fig