# Exploring the Recurrence Relations for CLAM Clustering

For a dataset of size `n` and a branching factor of `b`, the number of clusters in the tree `T(n)` is given by the following recurrence relations:

$T(1) = 1, T(2) = 1$, the leaf clusters

$T(n) = n - 1, \ for \ 3 <= n <= b + 1$, the parents of leaf clusters

$T(1 + a + b * n) = 1 + a * T(n + 1) + (b - a) * T(n), \ for \ 1 + a + b * n > b + 1 \ and \ 0 <= a < b$

In [1]:
import pandas
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# more imports here


In [3]:
def compute_memo(min_n: int, max_n: int, b: int) -> pandas.DataFrame:
    """Tabulate the number of clusters T(n) for every n in [min_n, max_n].

    For a dataset of size n and a branching factor of b, the number of clusters in the tree T(n)
    is computed bottom-up from the following recurrence relations:
      - T(1) = 1 and T(2) = 1, the leaf clusters
      - T(n) = n - 1 for 3 <= n <= b + 1, parent cluster whose children are all leaves
      - T(n) = 1 + a * T(q + 1) + (b - a) * T(q) for n >= b + 2, where q = (n - 1) // b and
        a = (n - 1) % b (equivalently n = 1 + a + b * q with 0 <= a < b)

    Args:
        min_n: The minimum value of n to report (inclusive). Smaller values are still computed,
            because the recurrence needs them, but are left out of the output to reduce noise.
        max_n: The maximum value of n to compute and report (inclusive).
        b: The branching factor.

    Returns:
        A pandas DataFrame with columns "n", "T(n)", and "T(n)/n", one row per n from min_n to
        max_n. The frame is empty when min_n > max_n.

    Raises:
        ValueError: If b < 1 or min_n < 1 (T(n) is only defined for n >= 1, and T(n)/n would
            divide by zero at n = 0).
    """
    if b < 1:
        raise ValueError(f"b must be at least 1, got {b}")
    if min_n < 1:
        raise ValueError(f"min_n must be at least 1, got {min_n}")

    # memo[n] holds T(n). Index 0 is an unused placeholder, and the table always has room for
    # the two leaf base cases even when max_n is smaller than 2.
    memo = [0] * (max(max_n, 2) + 1)
    memo[1] = 1
    memo[2] = 1

    # Parents of leaves only; clipped to max_n so a small table is never overrun.
    for n in range(3, min(b + 1, max_n) + 1):
        memo[n] = n - 1

    # General case: q >= 1 here, and both q and q + 1 are strictly less than n, so the entries
    # being read have already been filled in.
    for n in range(b + 2, max_n + 1):
        q, a = divmod(n - 1, b)
        memo[n] = 1 + a * memo[q + 1] + (b - a) * memo[q]

    # Pair each n with its own memo entry (memo[n], not memo[n + 1]).
    ns = list(range(min_n, max_n + 1))
    ts = [memo[n] for n in ns]
    rs = [t / n for n, t in zip(ns, ts)]

    return pandas.DataFrame({"n": ns, "T(n)": ts, "T(n)/n": rs})

In [4]:
# pyright: reportUnknownMemberType=false

def make_plot(min_n: int, max_n: int, data: list[tuple[int, pandas.DataFrame]]) -> go.Figure:
    """Plot T(n) and T(n)/n against n for every branching factor in `data`.

    The figure has one row with two panels. The left panel draws T(n) against n on log-log axes,
    plus a dashed black line shape whose endpoints are given as (min_n, min_n) and
    (max_n, max_n). The right panel draws T(n)/n against n with a log-scaled x-axis. Both traces
    for a branching factor share one color and one legend group, and only the left-panel trace
    gets a legend entry.

    Args:
        min_n: Coordinate used for the start of the dashed line shape.
        max_n: Coordinate used for the end of the dashed line shape.
        data: Pairs of (branching factor, DataFrame); each DataFrame is read through its "n",
            "T(n)", and "T(n)/n" columns.

    Returns:
        The assembled plotly Figure (2400 x 800, legend shown).
    """
    import plotly.colors as pc

    # Pick the palette by how many branching factors there are, then cycle through it so that
    # every factor gets a color even when there are more factors than palette entries.
    factors = [factor for factor, _ in data]
    if len(factors) <= len(pc.qualitative.Set1):
        palette = pc.qualitative.Set1
    else:
        palette = pc.qualitative.Plotly
    color_map = {factor: palette[idx % len(palette)] for idx, factor in enumerate(factors)}

    panel_titles = [
        "Number of clusters in trees T(n) with various branching factors",
        "Ratio of clusters to data points T(n)/n with various branching factors",
    ]
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=panel_titles,
        column_widths=[0.33, 0.67],
        horizontal_spacing=0.05,
    )

    # Dashed reference line on the left panel.
    fig.add_shape(
        type="line",
        x0=min_n,
        y0=min_n,
        x1=max_n,
        y1=max_n,
        line={"color": "Black", "dash": "dash"},
        row=1,
        col=1,
    )

    for factor, frame in data:
        label = f"b={factor}"
        shade = color_map[factor]

        # Left panel: T(n) vs n. This trace carries the legend entry for the group.
        counts_trace = go.Scatter(
            x=frame["n"],
            y=frame["T(n)"],
            mode="lines",
            name=label,
            legendgroup=label,
            line={"color": shade},
        )
        fig.add_trace(counts_trace, row=1, col=1)

        # Right panel: T(n)/n vs n. Same legend group, but hidden from the legend to avoid
        # listing every branching factor twice.
        ratio_trace = go.Scatter(
            x=frame["n"],
            y=frame["T(n)/n"],
            mode="lines",
            name=label,
            legendgroup=label,
            line={"color": shade},
            showlegend=False,
        )
        fig.add_trace(ratio_trace, row=1, col=2)

    # Both panels share the same log-scaled x-axis settings; only the left y-axis is log-scaled.
    for col in (1, 2):
        fig.update_xaxes(type="log", title_text="n (log scale)", row=1, col=col)
    fig.update_yaxes(type="log", title_text="T(n) (log scale)", row=1, col=1)
    fig.update_yaxes(title_text="T(n)/n", row=1, col=2)

    fig.update_layout(width=2400, height=800, showlegend=True)
    return fig

In [5]:
# More helper functions here


In [6]:
# Parameters for a single run: report n from 10 up to 100,000 with branching factor 2.
# These names are notebook globals that the later cells reuse.
min_n = 10
max_n = 100_000
b = 2

# Table of n, T(n), and T(n)/n for this one branching factor.
tree_size_df = compute_memo(min_n, max_n, b)

In [7]:
# make_plot expects a list of (branching factor, DataFrame) pairs, so wrap the single result.
fig = make_plot(min_n, max_n, [(b, tree_size_df)])
fig.show()

In [None]:
# Sweep branching factors 2 through 10, pairing each table with the b that produced it.
bs = list(range(2, 11))
data = [(b, compute_memo(min_n, max_n, b)) for b in bs]

In [None]:
# Overlay every branching factor currently held in `data` in one figure.
fig = make_plot(min_n, max_n, data)
fig.show()

In [None]:
# Wider sweep: branching factors 2 through 64. min_n and max_n are set again here (to the same
# values used earlier in the notebook), so this cell does not rely on the earlier assignment.
min_n = 10
max_n = 100_000
bs = list(range(2, 65))
data = [(b, compute_memo(min_n, max_n, b)) for b in bs]

In [None]:
# Plot the wider sweep. There are more branching factors than palette colors, so make_plot
# cycles through the palette and some factors share a color.
fig = make_plot(min_n, max_n, data)
fig.show()