In [1]:
# These lines are only for rendering in the docs, and are hidden through Jupyter tags
# Do not run if you're running the notebook separately
# Hidden from doc by virtue of cell tags - in VSCode right-click on the bar to the left of this cell, edit cell tags, see metadata

import plotly.io as pio
import plotly as plotly

# Alternative renderer setups, left disabled:
# plotly.offline.init_notebook_mode()
# pio.renderers.default = "vscode"

# Set plotly's default renderer for the figures shown in this notebook.
pio.renderers.default = "notebook_connected"

# Example IH Analysis

Interaction History (IH) is a rich source of data at the level of individual interactions from Pega DSM applications like CDH. It contains the time of the interaction, the channel, the actions/treatments, the customer ID and is used to track different types of outcomes (decisions, sends, opens, clicks, etc). It does **not** contain details of individual customers - only their IDs.

Interaction History is typically used to analyze customer behavior and optimize decision strategies. The following sections provide various example analyses that can be performed on IH data, including distribution analysis, response analysis, success rates, model performance, propensity distribution, and response time analysis.

Like most of PDSTools, this notebook uses [plotly](https://plotly.com/python/) for visualization and [polars](https://docs.pola.rs/) for dataframes. Its purpose is to showcase example analyses rather than to provide re-usable code, although some generic, re-usable functions are of course already provided. All of the analyses should be easy to replicate in other analytical or BI environments - except perhaps the analysis of model performance / AUC.

This notebook uses mock data generated by PDS Tools. Replace it with your own actual IH data and modify the analyses as appropriate.

In [2]:
from pdstools import IH
import polars as pl
# To analyse an actual dataset export instead of mock data, load it like this:
# ih = IH.from_ds_export(
#     "../../data/Data-pxStrategyResult_pxInteractionHistory_20210101T010000_GMT.zip"
# )
# Mock interaction history; n presumably controls the amount of generated
# data - TODO confirm against the pdstools documentation.
ih = IH.from_mock_data(n=1e5)

Preview of the raw IH data

In [3]:
# head() limits the preview to the first rows; collect() materialises the result for display.
ih.data.head().collect()

InteractionID,Channel,Issue,Group,Name,Treatment,ExperimentGroup,ModelTechnique,OutcomeTime,Direction,Outcome,BasePropensity,Propensity
str,str,str,str,str,str,str,str,datetime[μs],str,str,f64,f64
"""1000000000""","""Web""","""Risk""","""Savings""","""Savings_3""","""Savings_3_WebTreatment1""","""Conversion-Test""","""NaiveBayes""",2025-07-16 14:31:42.865343,"""Inbound""","""Impression""",0.028997,0.031488
"""1000000001""","""Web""","""Risk""","""Mortgages""","""Mortgages_6""","""Mortgages_6_WebTreatment2""","""Conversion-Test""","""GradientBoost""",2025-07-16 14:30:25.105343,"""Inbound""","""Impression""",0.009354,0.008834
"""1000000002""","""Email""","""Risk""","""Savings""","""Savings_4""","""Savings_4_EmailTreatment2""","""Conversion-Control""","""NaiveBayes""",2025-07-16 14:29:07.345343,"""Outbound""","""Pending""",0.013134,0.011579
"""1000000003""","""Email""","""Retention""","""Savings""","""Savings_6""","""Savings_6_EmailTreatment1""","""Conversion-Control""","""GradientBoost""",2025-07-16 14:27:49.585343,"""Outbound""","""Pending""",0.00688,0.00664
"""1000000004""","""Email""","""Risk""","""Investments""","""Investments_7""","""Investments_7_EmailTreatment2""","""Conversion-Test""","""NaiveBayes""",2025-07-16 14:26:31.825343,"""Outbound""","""Pending""",0.003905,0.003202


The same interaction can occur multiple times: once when the first decision is made, then later when responses are captured (accepted, sent, clicked, etc.). For some of the analyses we need to group the positive outcomes by interaction. This is what that data looks like:

In [4]:
# Summarise per interaction, additionally keyed by Channel; only the first rows are collected for display.
ih.aggregates.summarize_by_interaction(by=["Channel"]).head().collect()

Channel,InteractionID,Interaction_Outcome_Engagement,Interaction_Outcome_Conversion,Interaction_Outcome_OpenRate,Propensity
str,str,bool,bool,bool,f64
"""Email""","""1000017223""",False,False,False,0.020036
"""Email""","""1000069313""",False,False,False,0.006585
"""Email""","""1000014088""",False,False,False,0.003665
"""Web""","""1000079534""",True,False,False,0.020034
"""Email""","""1000082437""",False,False,False,0.003951


## Distribution Analysis

A distribution of the offers (actions/treatments) is often the most obvious type of analysis. You can do an action distribution for specific outcomes (what is offered, what is accepted), view it conditionally (what got offered last month vs this month) - possibly with a delta view, or over time.

In [5]:
# Built-in pdstools plot, called with all of its default arguments.
ih.plot.response_count_tree_map()

In [6]:
# Restrict the distribution to the clicked and accepted outcomes.
clicked_or_accepted = pl.col("Outcome").is_in(["Clicked", "Accepted"])

fig = ih.plot.action_distribution(
    query=clicked_or_accepted,
    title="Distribution of Actions",
    color="Outcome",
)
# Tip: uncomment the next line to show all names on the y-axis.
# fig.update_layout(yaxis=dict(tickmode="linear"))
fig.show()

## Response Analysis

A simple view of the responses over time shows how many responses are received per day (or any other period).

In [7]:
# every="1d" counts the responses per day; another period can be passed instead.
ih.plot.response_count(every="1d")

Which could be viewed per channel as well:

In [8]:
# Facet the counts by channel; the query excludes rows whose Channel is the empty string.
ih.plot.response_count(
    facet="Channel",
    query=pl.col.Channel != "",
)

## Success Rates

Success rates (accept rate, open rate, conversion rate) are interesting to track over time. In addition you may want to split by e.g. Channel, or contrast the rates for different experimental setups in an A-B testing set-up.

In [9]:
# Facet the success rates by channel, using only rows with a non-null, non-empty Channel.
ih.plot.success_rate(
    facet="Channel", query=pl.col.Channel.is_not_null() & (pl.col.Channel != "")
)

## Model Performance

Similar to Success Rates: typically viewed over time, likely split by channel, conditioned on variations, e.g. NB vs AGB models.

In [10]:
# Model performance split by channel, aggregated per week (every="1w").
ih.plot.model_performance_trend(by="Channel", every="1w")

## AGB vs NB analysis

There are different types of ADM models you can use in CDH. This analysis shows the model performance of the (classic) Naive Bayes models vs the new Gradient Boosting models. We split by channel as this often matters.

In [11]:
# Split by model technique, with one facet per channel, aggregated per week.
fig = ih.plot.model_performance_trend(
    by="ModelTechnique",
    facet="Channel",
    every="1w",
    title="Model Performance of Naive Bayes vs Gradient Boosting Models",
)
# The plot is split by ModelTechnique, so give the legend a matching, shorter title.
fig.update_layout(legend_title_text="Technique")
fig

# Propensity Analysis

IH also contains information about the factors that determine the prioritization of the offers: lever values, propensities etc.

## Propensity Distribution

Here we show the distribution of the propensities of the offers made. It's also a first example of a custom analysis not currently supported directly by the PDSTools library. You can see how we access the underlying IH data (**ih.data**), then aggregate and display it.


In [12]:
import plotly.figure_factory as ff

# Fixed seed so that the 10% sample - and therefore the plot - is identical on
# every run over the same data.
SAMPLE_SEED = 42

# Distinct channels present in the data, ignoring missing and empty values.
channels = [
    c
    for c in ih.data.select(pl.col.Channel.unique().sort())
    .collect()["Channel"]
    .to_list()
    if c is not None and c != ""
]

# One list of sampled propensities per channel, in the same order as `channels`
# so that the group labels below line up with the data.
plot_data = [
    ih.data.filter(pl.col.Channel == c)
    .select(["Propensity"])
    .collect()["Propensity"]
    .drop_nulls()  # missing propensities cannot be used in the density estimate
    .sample(fraction=0.1, seed=SAMPLE_SEED)
    .to_list()
    for c in channels
]

# Density curves only (no histogram bars), one curve per channel.
fig = ff.create_distplot(plot_data, group_labels=channels, show_hist=False)
fig.update_layout(
    title="Propensity Distribution",
    yaxis=dict(showticklabels=False),
    xaxis=dict(title="Propensity", tickformat=".0%"),
    legend_title_text="Channel",
    template="pega",
)
fig

## Propensity Calibration



We can verify the accuracy of the propensities generated by Pega against the actual click-through rates in the interaction history data. Although there currently is no direct IH plot to do this, the underlying aggregation functions are generic enough to support it.

The plot shows how propensities calibrate against the click through rates in IH.

In [13]:
import plotly.express as px

# qcut gives 10 equal-volume propensity bins. The generated bin labels are long
# and not very readable; running qcut on the raw data first and setting the
# labels programmatically would solve that.
propensity_bin = pl.col("Propensity").qcut(10).alias("PropensityBin")
success_rates = ih.aggregates.summary_success_rates(
    by=[propensity_bin, "Channel", "Direction"]
).collect()

# Long format with the engagement success rate exposed as CTR. Channel is
# overwritten with "Channel/Direction", which is what the facet rows use.
calibration = success_rates.unpivot(
    on=["SuccessRate_Engagement"],
    index=["PropensityBin", "Channel", "Direction"],
    variable_name="KPI",
    value_name="CTR",
).with_columns(Channel=pl.concat_str("Channel", "Direction", separator="/"))

calibration_fig = px.bar(
    calibration,
    x="PropensityBin",
    y="CTR",
    facet_row="Channel",
    template="pega",
    title="Propensity Calibration",
)
calibration_fig.update_xaxes(title="")
calibration_fig.update_yaxes(tickformat=".2%")
calibration_fig

## Response Time Analysis

Time is one of the dimensions in IH. Here we take a look at how subsequent responses relate to the original decision. It shows, for example, how much time there typically is between the moment of decision and the click.

This type of analysis is usually part of attribution analysis when considering conversion modeling.


In [14]:
import plotly.express as px

# All distinct outcome labels in the data, skipping missing and empty ones.
outcome_labels = ih.data.select(pl.col("Outcome").unique().sort()).collect()
outcomes = [
    label
    for label in outcome_labels.get_column("Outcome").to_list()
    if not (label is None or label == "")
]

# Per interaction: the earliest timestamp is taken as the decision time, and
# for every outcome label the latest timestamp with that outcome is kept in a
# column named after the label.
decision_time = pl.col("OutcomeTime").min().alias("Decision_Time")
latest_per_outcome = [
    pl.col("OutcomeTime").filter(pl.col("Outcome") == label).max().alias(label)
    for label in outcomes
]
per_interaction = (
    ih.data.filter(pl.col("OutcomeTime").is_not_null())
    .group_by("InteractionID")
    .agg([decision_time, *latest_per_outcome])
    .collect()
)

# Long format: one row per interaction and outcome, with the number of seconds
# between the decision and that outcome. Only strictly positive durations are
# kept, which the logarithmic x-axis below requires.
seconds_since_decision = (pl.col("Time") - pl.col("Decision_Time")).dt.total_seconds()
plot_data = (
    per_interaction.unpivot(
        index=["InteractionID", "Decision_Time"],
        variable_name="Outcome",
        value_name="Time",
    )
    .with_columns(Duration=seconds_since_decision)
    .filter(pl.col("Duration") > 0)
)

# Order the outcomes by their median duration, shortest first.
median_durations = plot_data.group_by("Outcome").agg(
    Duration=pl.col("Duration").median()
)
ordered_outcomes = median_durations.sort("Duration").get_column("Outcome").to_list()

fig = px.box(
    plot_data,
    x="Duration",
    y="Outcome",
    color="Outcome",
    template="pega",
    category_orders={"Outcome": ordered_outcomes},
    points=False,
    title="Duration of Responses",
    log_x=True,
)
fig.update_layout(
    xaxis_title="Duration (seconds) with logarithmic scale",
    yaxis_title="",
)
fig

# Pattern Analysis

This method uncovers patterns in customer behavior by analyzing the sequences of actions that lead to outcomes like conversions. 
By calculating Pointwise Mutual Information (PMI), we highlight strong associations between actions in customer journeys.

In [15]:
# Build the action sequences (at the level of action Name) and their outcomes,
# treating "Conversion" as the positive outcome.
# NOTE(review): the mock data preview shows no customer ID column, so
# InteractionID is passed as the customer ID here - with real IH data you
# presumably want to pass your actual customer ID column instead.
(
    customer_sequences,
    customer_outcomes,
    count_actions,
    count_sequences,
) = ih.get_sequences(
    positive_outcome_label="Conversion",
    outcome_column="Outcome",
    level="Name",
    customerid_column="InteractionID",
)

# PMI is calculated from the action and sequence counts.
sequences = ih.calculate_pmi(count_actions, count_sequences)

# Combine the PMI results with the sequence data into one overview table.
sequences_df = ih.pmi_overview(
    sequences,
    count_sequences,
    customer_sequences,
    customer_outcomes,
)

sequences_df.head()

Sequence,Length,Avg PMI,Frequency,Unique freq,Score
list[str],i64,f64,i64,i64,f64
"[""Savings_4"", ""Savings_4""]",2,5.594,23,23,17.541
"[""Insurance_3"", ""Insurance_3""]",2,4.784,39,39,17.525
"[""Insurance_5"", ""Insurance_5""]",2,5.296,27,27,17.455
"[""Savings_5"", ""Savings_5""]",2,5.417,25,25,17.435
"[""Mortgages_1"", ""Mortgages_1""]",2,5.417,25,25,17.435
