In [27]:
import os
from urllib.request import urlopen, urlretrieve
import torch
import numpy as np
import certifi

from sklearn.metrics import mutual_info_score
# Export the path of certifi's CA bundle as SSL_CERT_FILE so the HTTPS dataset
# downloads performed later in this notebook can verify server certificates.
os.environ["SSL_CERT_FILE"] = certifi.where()

def avg_pairwise_mi(X, sample=3000, max_pairs=2000, seed=0):
    """Estimate the mean pairwise mutual information between the columns of X.

    The estimate is computed on a random subset of rows and, when there are
    many columns, on a random subset of column pairs.

    Note on units: each pairwise value comes from
    ``sklearn.metrics.mutual_info_score``, which uses the natural logarithm,
    so the result is in **nats**. Divide by ``np.log(2)`` to obtain bits.

    Args:
        X: 2-D array-like of shape (N, D) holding discrete values (the
            notebook uses binary data). Anything ``np.asarray`` accepts
            works, e.g. a NumPy array or a CPU torch tensor.
        sample: Maximum number of rows used; rows are drawn without
            replacement.
        max_pairs: Maximum number of column pairs evaluated; pairs are drawn
            without replacement when more than this many exist.
        seed: Seed for the NumPy generator that drives both subsamplings.

    Returns:
        The mean mutual information over the evaluated pairs as a Python
        float, or 0.0 when X has fewer than two columns.
    """
    # Work on a NumPy view so indexing and sklearn see plain arrays.
    X = np.asarray(X)
    n, d = X.shape
    rng = np.random.default_rng(seed)

    # All unordered column pairs (i < j), in the same row-major order a nested
    # loop would produce, but built without a Python-level O(D^2) loop.
    all_pairs = np.column_stack(np.triu_indices(d, k=1))
    if len(all_pairs) > max_pairs:
        idx = rng.choice(len(all_pairs), size=max_pairs, replace=False)
        pairs = all_pairs[idx]
    else:
        pairs = all_pairs

    # Row subsample (drawn after the pair subsample, from the same generator).
    m = min(sample, n)
    Xs = X[rng.choice(n, m, replace=False)]

    mi_vals = [mutual_info_score(Xs[:, i], Xs[:, j]) for i, j in pairs]
    return float(np.mean(mi_vals)) if mi_vals else 0.0

def get_data_loaders(
    batch_size=None,
    data_dir="./data",
    dataset="nltcs",
    conditional=False,
    max_samples=None,
):
    """Build train/validation DataLoaders for a density-estimation dataset.

    The train and test splits of ``dataset`` are fetched from the
    UCLA-StarAI Density-Estimation-Datasets repository the first time they
    are needed, cached under ``<data_dir>/<dataset>/``, and read from that
    cache on every call. Each file is parsed as comma-separated integers.

    Args:
        batch_size: Batch size for both loaders. ``None`` means one batch
            containing the whole (possibly truncated) training set.
        data_dir: Root directory of the local download cache.
        dataset: Name of the dataset; must be one of the names listed in
            ``DATASETS`` below.
        conditional: If True, the columns are randomly permuted and split in
            half: the first half becomes the input ``x`` and the second half
            the target ``y``. If False, ``y`` is the full data matrix and
            ``x`` is a constant float column of ones.
        max_samples: If given, keep only the first ``max_samples`` training
            rows. The validation split is never truncated.

    Returns:
        ``(train_loader, val_loader)``: loaders over
        ``TensorDataset(x, y)``; the training loader shuffles, the
        validation loader does not. The "val" loader is built from the
        repository's ``*.test.data`` file.

    Raises:
        KeyError: If ``dataset`` is not a known dataset name.
    """

    URI = "https://raw.githubusercontent.com/UCLA-StarAI/Density-Estimation-Datasets/refs/heads/master/datasets/"

    # Every dataset follows the same "<name>/<name>.<split>.data" layout.
    DATASETS = (
        "nltcs",
        "msnbc",
        "kdd",
        "plants",
        "baudio",
        "jester",
        "bnetflix",
        "accidents",
        "pumsb_star",
        "dna",
        "kosarek",
        "msweb",
        "book",
    )

    # Validate before touching the filesystem so a typo does not leave an
    # empty cache directory behind.
    if dataset not in DATASETS:
        raise KeyError(
            f"unknown dataset {dataset!r}; expected one of: {', '.join(DATASETS)}"
        )

    dataset_dir = os.path.join(data_dir, dataset)
    train_path = os.path.join(dataset_dir, f"{dataset}.train.data")
    val_path = os.path.join(dataset_dir, f"{dataset}.test.data")
    remote = {
        train_path: f"{URI}{dataset}/{dataset}.train.data",
        val_path: f"{URI}{dataset}/{dataset}.test.data",
    }
    os.makedirs(dataset_dir, exist_ok=True)

    # Download if missing. Fetch into a ".part" file and rename afterwards so
    # an interrupted download is never mistaken for a complete cached file.
    for path, url in remote.items():
        if not os.path.exists(path):
            partial_path = path + ".part"
            urlretrieve(url, partial_path)
            os.replace(partial_path, path)

    # Read from the local cache (no second network round-trip). ndmin=2 keeps
    # a single-row file two-dimensional so shape[1] below is always valid.
    x_train = np.loadtxt(train_path, dtype=int, delimiter=",", ndmin=2)
    x_val = np.loadtxt(val_path, dtype=int, delimiter=",", ndmin=2)

    x_train = torch.from_numpy(x_train)
    x_val = torch.from_numpy(x_val)

    if max_samples is not None:
        x_train = x_train[:max_samples]

    D = x_train.shape[1]
    # Drawn unconditionally (even when unused) so the amount of torch RNG
    # state consumed by this function does not depend on `conditional`.
    cols = torch.randperm(D)
    if conditional:
        # Same column permutation for both splits: first half -> x, rest -> y.
        x_train, y_train = x_train[:, cols[: D // 2]], x_train[:, cols[D // 2 :]]
        x_val, y_val = x_val[:, cols[: D // 2]], x_val[:, cols[D // 2 :]]
    else:
        # Unconditional setting: the data is the target, the input is a dummy
        # constant feature.
        y_train = x_train.clone()
        y_val = x_val.clone()
        x_train = torch.ones(x_train.shape[0], 1, device=x_train.device).float()
        x_val = torch.ones(x_val.shape[0], 1, device=x_val.device).float()

    train_set = torch.utils.data.TensorDataset(x_train, y_train)
    val_set = torch.utils.data.TensorDataset(x_val, y_val)

    if batch_size is None:
        batch_size = len(train_set)

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=batch_size, shuffle=False
    )
    return train_loader, val_loader


In [37]:
import pandas as pd
# Per-dataset summary statistics, one dict per dataset; turned into a
# DataFrame in a later cell.
rows = []
for dataset in ["nltcs", "msnbc", "kdd", "plants", "baudio", "jester", "bnetflix", "accidents", "pumsb_star", "dna", "kosarek", "msweb", "book"]:
    # batch_size is left at its default (None), so the first training batch
    # holds the whole training set, here capped at max_samples=1000 rows.
    train_dl, test_dl = get_data_loaders(dataset=dataset, conditional=False, max_samples=1000)
    # With conditional=False the first element of a batch is a constant column
    # of ones; the second element (named x_train here) is the data matrix.
    _, x_train = next(iter(train_dl))
    # Fraction of entries that are exactly zero.
    sparsity = (x_train == 0).sum() / x_train.numel()
    # Mean pairwise mutual information between columns, with the function's
    # default row/pair subsampling and seed.
    avg_mi = avg_pairwise_mi(x_train)
    rows.append({"dataset": dataset, "sparsity": sparsity.item(), "avg_mi": avg_mi})


In [None]:
# Tabulate the per-dataset statistics collected in `rows`.
df = pd.DataFrame(rows)
# Last expression of the cell: display the table ordered by ascending average
# pairwise MI. sort_values returns a new frame; `df` keeps its original order.
df.sort_values("avg_mi")

Unnamed: 0,dataset,sparsity,avg_mi
11,msweb,0.989742,0.000136
10,kosarek,0.979116,0.001334
9,dna,0.746256,0.001662
12,book,0.983548,0.001904
2,kdd,0.989266,0.002396
1,msnbc,0.834412,0.003472
7,accidents,0.70936,0.005766
6,bnetflix,0.46148,0.006361
4,baudio,0.80259,0.011658
5,jester,0.37924,0.016603
