In [None]:
# These lines are only for rendering in the docs, and are hidden through Jupyter tags
# Do not run if you're running the notebook separately
# Hidden from doc by virtue of cell tags - in VSCode right-click on the bar to the left of this cell, edit cell tags, see metadata

import plotly.io as pio

# Make "notebook_connected" the default plotly renderer for every figure shown below
pio.renderers.default = "notebook_connected"

In [None]:
from pdstools import IH

# from pdstools.utils import cdh_utils

import polars as pl
import plotly as plotly


# plotly.offline.init_notebook_mode()
# pio.renderers.default = "vscode"

# Example IH Analysis

Interaction History (IH) is a rich source of data at the level of individual interactions from Pega DSM applications like CDH. It contains the time of the interaction, the channel, the actions/treatments, the customer ID and is used to track different types of outcomes (decisions, sends, opens, clicks, etc). It does **not** contain details of individual customers - only their IDs.

Interaction History is typically used to analyze customer behavior and optimize decision strategies. The following sections provide various example analyses that can be performed on IH data, including distribution analysis, response analysis, success rates, model performance, propensity distribution, and response time analysis.

Like most of PDSTools, this notebook uses [plotly](https://plotly.com/python/) for visualization and [polars](https://docs.pola.rs/) for dataframes. Its purpose is to showcase example analyses rather than to provide re-usable code, although we do try to provide some generic, re-usable functions. All of the analyses should be easy to replicate in other analytical BI environments - except perhaps the analysis of model performance / AUC.

This notebook uses sample data shipped with PDSTools. Replace it with your own actual IH data and modify the analyses as appropriate.

In [None]:
# Load the interaction history used throughout this notebook.
# To analyze a dataset export instead of generated mock data, use the
# commented-out call below and point it at your own file.
# ih = IH.from_ds_export(
#     "../../data/Data-pxStrategyResult_pxInteractionHistory_20210101T010000_GMT.zip"
# )
# `n` is given as an integer literal rather than the float `1e5`
# (presumably it is the number of mock records to generate - confirm in the API docs).
ih = IH.from_mock_data(n=100_000)

Preview of the raw IH data

In [None]:
# Take the first rows of the IH data and materialize them with collect() for display
ih.data.head().collect()

The same interaction can occur multiple times: once when the first decision is made, then later when responses are captured (accepted, sent, clicked, etc.). For some of the analyses we need to group by interaction. This is what that data looks like:

In [None]:
# Show the first rows of the per-interaction summary, computed by Channel.
# NOTE(review): `_summary_interactions` starts with an underscore, so it is a
# non-public helper of the library and may change without notice.
ih.aggregates._summary_interactions(by=["Channel"]).head().collect()

## Distribution Analysis

A distribution of the offers (actions/treatments) is often the most obvious type of analysis. You can do an action distribution for specific outcomes (what is offered, what is accepted), view it conditionally (what got offered last month vs this month) - possibly with a delta view, or over time.

In [None]:
# Built-in tree map plot of the response counts, using the library defaults
ih.plot.response_count_tree_map()

In [None]:
# Keep only the "Clicked" and "Accepted" outcomes and color the bars by outcome
clicked_or_accepted = pl.col("Outcome").is_in(["Clicked", "Accepted"])
fig = ih.plot.action_distribution(
    title="Distribution of Actions",
    color="Outcome",
    query=clicked_or_accepted,
)
# fig.update_layout(yaxis=dict(tickmode="linear")) # to show all names
fig

## Response Analysis

A simple view of the responses over time shows how many responses are received per day (or any other period).

In [None]:
# Response counts over time, aggregated per period of one day ("1d")
ih.plot.response_count(every="1d")

Which could be viewed per channel as well:

In [None]:
# One facet per channel; records with an empty channel name are left out
has_channel_name = pl.col("Channel") != ""
ih.plot.response_count(query=has_channel_name, facet="Channel")

## Success Rates

Success rates (accept rate, open rate, conversion rate) are interesting to track over time. In addition, you may want to split by e.g. Channel, or contrast the rates for different experimental setups in an A/B testing setup.

In [None]:
# Success rate per channel, skipping records whose channel is missing or empty
channel_is_known = pl.col("Channel").is_not_null() & (pl.col("Channel") != "")
ih.plot.success_rate(query=channel_is_known, facet="Channel")

## Model Performance

Similar to Success Rates: typically viewed over time, likely split by channel, conditioned on variations, e.g. NB vs AGB models.

In [None]:
# Model performance trend split by Channel, aggregated per period of one week ("1w")
ih.plot.model_performance_trend(by="Channel", every="1w")

## AGB vs NB analysis

There are different types of ADM models you can use in CDH. This analysis shows the model performance of the (classic) Naive Bayes models vs the new Gradient Boosting models. We split by channel as this often matters.

In [None]:
# Weekly performance trend: one line per model technique, one facet per channel
trend_options = {
    "by": "ModelTechnique",
    "facet": "Channel",
    "every": "1w",
    "title": "Model Performance of Naive Bayes vs Gradient Boosting Models",
}
fig = ih.plot.model_performance_trend(**trend_options)
# Shorter legend heading than the raw column name
fig.update_layout(legend_title_text="Technique")
fig

## Propensity Distribution

IH also contains information about the factors that determine the prioritization of the offers: lever values, propensities etc.

Here we show the distribution of the propensities of the offers made. It's also a first example of a custom analysis not currently supported directly by the PDSTools library. You can see how we access the underlying IH data (**ih.data**), then aggregate and display it.


In [None]:
import plotly.figure_factory as ff

# Distinct channel names, skipping missing and empty values
channels = [
    c
    for c in ih.data.select(pl.col.Channel.unique().sort())
    .collect()["Channel"]
    .to_list()
    if c is not None and c != ""
]

# Draw a 10% sample of the propensities for every channel.
# - Null propensities are dropped first: the density estimate behind the
#   distplot cannot handle missing values.
# - The fixed seed makes the sample, and therefore the plot, reproducible.
sampled_propensities = {
    c: ih.data.filter((pl.col.Channel == c) & pl.col.Propensity.is_not_null())
    .select(["Propensity"])
    .collect()["Propensity"]
    .sample(fraction=0.1, seed=42)
    .to_list()
    for c in channels
}

# A density curve needs at least two points, so channels with fewer sampled
# values are left out; labels and data are rebuilt together to stay aligned.
sampled_propensities = {
    c: values for c, values in sampled_propensities.items() if len(values) > 1
}
channels = list(sampled_propensities)
plot_data = list(sampled_propensities.values())

# Curves only (no histogram bars), one per channel
fig = ff.create_distplot(plot_data, group_labels=channels, show_hist=False)
fig.update_layout(
    title="Propensity Distribution",
    yaxis=dict(showticklabels=False),
    xaxis=dict(title="Propensity", tickformat=".0%"),
    legend_title_text="Channel",
    template="pega",
)
fig

## Response Time Analysis

Time is one of the dimensions in IH. Here we take a look at how subsequent responses relate to the original decision. It shows, for example, how much time there typically is between the moment of decision and the click.

This type of analysis is usually part of attribution analysis when considering conversion modeling.


In [None]:
import plotly.express as px

# Distinct outcome labels in sorted order, without missing or empty values
distinct_outcomes = (
    ih.data.select(pl.col("Outcome").unique().sort()).collect()["Outcome"].to_list()
)
outcomes = [label for label in distinct_outcomes if label not in (None, "")]

# Per interaction: the earliest timestamp is taken as the decision time, plus
# one column per outcome holding the latest timestamp recorded for that outcome
decision_time = pl.col("OutcomeTime").min().alias("Decision_Time")
latest_time_per_outcome = [
    pl.col("OutcomeTime").filter(pl.col("Outcome") == label).max().alias(label)
    for label in outcomes
]
timings_wide = (
    ih.data.filter(pl.col("OutcomeTime").is_not_null())
    .group_by("InteractionID")
    .agg([decision_time] + latest_time_per_outcome)
    .collect()
)

# Reshape to one row per (interaction, outcome) and compute the number of
# seconds between the decision and that outcome; only strictly positive
# durations are kept (this also drops rows where the outcome never occurred)
seconds_since_decision = (pl.col("Time") - pl.col("Decision_Time")).dt.total_seconds()
plot_data = (
    timings_wide.unpivot(
        index=["InteractionID", "Decision_Time"],
        variable_name="Outcome",
        value_name="Time",
    )
    .with_columns(Duration=seconds_since_decision)
    .filter(pl.col("Duration") > 0)
)

# Order the outcomes by their median duration, shortest first
median_durations = (
    plot_data.group_by("Outcome")
    .agg(Duration=pl.col("Duration").median())
    .sort("Duration")
)
ordered_outcomes = median_durations["Outcome"].to_list()

fig = px.box(
    plot_data,
    x="Duration",
    y="Outcome",
    color="Outcome",
    category_orders={"Outcome": ordered_outcomes},
    points=False,
    log_x=True,
    title="Duration of Responses",
    template="pega",
)
fig.update_layout(
    xaxis_title="Duration (seconds) with logarithmic scale",
    yaxis_title="",
)
fig