# Exploring the Recurrence Relations for CLAM Clustering

For a dataset of size `n` and a branching factor of `k`, the number of clusters in the tree, `R(n)`, is given by the following recurrence relations:

$R(1) = 1, R(2) = 1$, the leaf clusters

$R(n) = n - 1, \quad \text{for } 3 < n \le k + 1$, the parents of leaf clusters

$R(1 + i + k \cdot n) = 1 + i \cdot R(n + 1) + (k - i) \cdot R(n), \quad \text{for } n > k + 1 \text{ and } 0 \le i < k$

In [12]:
import pathlib
import re

import pandas
import plotly.express as px
from plotly.graph_objs import Figure

In [13]:
# more imports here


In [None]:
# pyright: reportUnknownMemberType=false

def load_data(file_path: pathlib.Path) -> pandas.DataFrame:
    """Read the parquet file at ``file_path`` and return it as a DataFrame."""
    frame = pandas.read_parquet(file_path)
    return frame


def make_scatter_plots(
    memo_df: pandas.DataFrame,
    ratios_df: pandas.DataFrame,
) -> tuple[Figure, Figure]:
    """Create scatter plots for memos and ratios using Plotly.

    Each frame is converted to long form: its index is turned into an
    ``index`` column (plotted on the x-axis and labelled ``n``) and every
    other column becomes one colour series labelled ``k``.

    The input frames are left untouched, so the function can safely be called
    more than once on the same frames (e.g. when a notebook cell is re-run).

    Args:
        memo_df: Frame whose values are plotted as ``R(n, k)``. Presumably
            indexed by ``n`` with one column per ``k`` -- TODO confirm against
            the code that writes the parquet files.
        ratios_df: Frame whose values are plotted as ``R(n, k) / n``, with the
            same presumed layout as ``memo_df``.

    Returns:
        A ``(memos_fig, ratios_fig)`` tuple. ``memos_fig`` uses log scales on
        both axes; ``ratios_fig`` uses a log scale on the x-axis only.
    """
    # Use the non-mutating reset_index(): the in-place variant would add the
    # index column to the caller's frames, and a later call on those same
    # frames would insert yet another index column that melt() would then
    # treat as an extra data series.
    # NOTE(review): id_vars=["index"] assumes the frames have an unnamed index,
    # so that reset_index() names the new column "index" -- confirm.
    memo_melt = memo_df.reset_index().melt(
        id_vars=["index"],
        var_name="k",
        value_name="R(n, k)",
    )
    ratios_melt = ratios_df.reset_index().melt(
        id_vars=["index"],
        var_name="k",
        value_name="R(n, k) / n",
    )

    memos_fig = px.scatter(
        memo_melt,
        # melt() puts the id column first, so columns[0] is the former index.
        x=memo_melt.columns[0],
        y="R(n, k)",
        color="k",
        title="Size of tree R(n, k) vs n",
        log_x=True,
        log_y=True,
        opacity=0.4,
        height=600,
        width=900,
    )
    memos_fig.update_layout(xaxis_title="n", yaxis_title="R(n, k)", legend_title="k")

    ratios_fig = px.scatter(
        ratios_melt,
        x=ratios_melt.columns[0],
        y="R(n, k) / n",
        color="k",
        title="Ratio R(n, k) / n vs n",
        log_x=True,
        opacity=0.4,
        height=600,
        width=900,
    )
    ratios_fig.update_layout(xaxis_title="n", yaxis_title="R(n, k) / n", legend_title="k")

    return memos_fig, ratios_fig

In [15]:
# More helper functions here


In [16]:
# Directory holding the recurrence-relation data files, resolved relative to
# the parent of the current working directory.
_parent_of_cwd = pathlib.Path.cwd().parent
data_dir = _parent_of_cwd.joinpath("data", "recurrence_relations")
data_dir

PosixPath('/home/nishaq/Documents/research/clam/papers/cakes/data/recurrence_relations')

In [None]:
# File names encode the (max_n, max_k) pair they were generated for.
memo_regex = r"memos_(?P<max_n>\d+)_(?P<max_k>\d+)\.parquet\.gzip"
ratios_regex = r"ratios_(?P<max_n>\d+)_(?P<max_k>\d+)\.parquet\.gzip"

all_files = list(data_dir.glob("*.parquet.gzip"))
if not all_files:
    raise ValueError(f"No parquet files found in {data_dir}")

# Map (max_n, max_k) -> file name, separately for memo and ratio files.
memos_files: dict[tuple[int, int], str] = {}
ratios_files: dict[tuple[int, int], str] = {}

for f in all_files:
    for pattern, registry in ((memo_regex, memos_files), (ratios_regex, ratios_files)):
        # fullmatch() so the whole file name must fit the pattern; match()
        # would also accept names with extra text after ".parquet.gzip".
        found = re.fullmatch(pattern, f.name)
        if found:
            key = (int(found.group("max_n")), int(found.group("max_k")))
            registry[key] = f.name

# Keep only the keys for which both a memo file and a ratio file exist.
matched_keys = set(memos_files.keys()) & set(ratios_files.keys())
# Build the dict in sorted key order so the listing below is stable and easy
# to scan, rather than following arbitrary set iteration order.
matched_files = {k: (memos_files[k], ratios_files[k]) for k in sorted(matched_keys)}

print(f"Found {len(matched_files)} matched memo and ratio files.")
print("Matched keys (max_n, max_k):", matched_files.keys())

In [None]:
# Pick one matched (max_n, max_k) pair and load its memo and ratio frames.
# NOTE(review): this was described as the smallest dataset -- confirm against
# the matched keys printed by the previous cell.
selected_key = (1_000_000, 32)
memo_file, ratios_file = matched_files[selected_key]
memo_path, ratios_path = data_dir / memo_file, data_dir / ratios_file
memo_df, ratios_df = load_data(memo_path), load_data(ratios_path)

In [None]:
# Build both figures (tree size and size-to-n ratio) from the loaded frames.
memos_fig, ratios_fig = make_scatter_plots(memo_df, ratios_df)

In [None]:
# Render the ratio figure; `memos_fig` is not displayed in this cell.
ratios_fig.show()