In [None]:
import numpy as np

In [None]:
# This cell imports everything it uses so that it runs on a fresh kernel
# (Restart Kernel -> Run All) without relying on names defined further down.
import pickle

from scipy.special import softmax

imgs_path='/home/maria/MITNeuralComputation/vit_embeddings/images'
path = '/home/maria/Documents/HuggingMouseData/MouseViTEmbeddings/google_vit-base-patch16-224_embeddings_logits.pkl'
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open(path, 'rb') as f:
    vit_dict = pickle.load(f)['natural_scenes']

# Convert to a matrix.
# Iterating a mapping yields its KEYS, so when the entry is
# {image_id: logits_vector} the values must be taken explicitly; an array or
# list of logit vectors is stacked as it is.
logit_rows = vit_dict.values() if isinstance(vit_dict, dict) else vit_dict
embeddings = np.stack(list(logit_rows))  # shape: (N_images, n_classes)
print("Embeddings shape:", embeddings.shape)

# Compute softmax over classes
X = softmax(embeddings, axis=1)

# Centred log-ratio (CLR) transform: log-probabilities minus their per-row
# mean. The 1e-12 offset keeps the logarithm finite for zero probabilities.
# CLR is invariant to rescaling each row, so no renormalisation is needed.
log_X = np.log(X + 1e-12)
X_clr = log_X - log_X.mean(axis=1, keepdims=True)

In [3]:
#!/usr/bin/env python3
"""
allen_image_vs_embedding_entropy.py

Compare entropy of Allen natural-scene images and their
ViT softmax→Aitchison-CLR embeddings.

Outputs:
  - Mean pixel-histogram entropy  (bits)
  - Mean PNG bits-per-pixel       (bits/pixel)
  - Gaussian & kNN differential entropy of embeddings
  - Fraction of Gaussian entropy captured by top-k PCs
"""

import os, io, pickle, math
import numpy as np
from PIL import Image
from scipy.special import softmax, digamma
from math import gamma
from sklearn.decomposition import PCA

# ---------------------------------------------------------
# Config paths
# ---------------------------------------------------------
# Both locations default to the original machine-specific paths but can be
# overridden without editing the notebook by setting the environment variables
# ALLEN_IMGS_PATH (image directory) and VIT_EMBEDDINGS_PATH (pickled logits).
imgs_path = os.environ.get(
    "ALLEN_IMGS_PATH",
    '/home/maria/MITNeuralComputation/vit_embeddings/images',
)
emb_path = os.environ.get(
    "VIT_EMBEDDINGS_PATH",
    '/home/maria/Documents/HuggingMouseData/MouseViTEmbeddings/google_vit-base-patch16-224_embeddings_logits.pkl',
)

# ---------------------------------------------------------
# Utilities
# ---------------------------------------------------------
def clr(p, eps=1e-12):
    """Centred log-ratio (Aitchison CLR) transform of row-wise compositions.

    Parameters
    ----------
    p : array of shape (n_samples, n_parts)
        Non-negative compositions, one per row. The input is not modified.
    eps : float
        Floor applied to every entry before taking logarithms.

    Returns
    -------
    ndarray of shape (n_samples, n_parts)
        log(p / g(p)) per row, where g(p) is the row's geometric mean
        (computed after flooring and renormalising the row to sum to one).
    """
    floored = np.maximum(p, eps)
    closed = floored / floored.sum(axis=1, keepdims=True)
    # Geometric mean of each row, evaluated through the mean of the logs.
    geo_mean = np.exp(np.log(closed).mean(axis=1, keepdims=True))
    return np.log(closed / geo_mean)

def shannon_entropy_gray(im_u8):
    """Shannon entropy, in bits, of an image's grey-level histogram.

    Parameters
    ----------
    im_u8 : ndarray of non-negative integers (e.g. uint8)
        Image pixels; the array is flattened, so any shape is accepted.

    Returns
    -------
    float
        -sum(p * log2(p)) over the grey levels that actually occur.
    """
    counts = np.bincount(im_u8.ravel(), minlength=256).astype(float)
    probs = counts / counts.sum()
    # Levels that never occur contribute nothing (and would give log2(0)).
    occupied = probs[probs > 0]
    return -np.sum(occupied * np.log2(occupied))

def png_bits_per_pixel(im_u8):
    """Size of the image when PNG-encoded in memory, in bits per pixel.

    The array is encoded with ``optimize=True`` into an in-memory buffer and
    the encoded length (in bits) is divided by the number of pixels given by
    the first two dimensions of ``im_u8``.
    """
    stream = io.BytesIO()
    try:
        Image.fromarray(im_u8).save(stream, format="PNG", optimize=True)
        encoded_bits = 8 * len(stream.getvalue())
    finally:
        stream.close()
    n_rows, n_cols = im_u8.shape[:2]
    return encoded_bits / (n_rows * n_cols)

def gaussian_differential_entropy(X, ridge=1e-9):
    """Differential entropy, in bits, of a Gaussian fitted to the rows of X.

    Computes 0.5 * (d * (1 + ln(2*pi)) + ln det(S)) / ln(2), where S is the
    sample covariance of X plus ``ridge`` times the identity.

    Parameters
    ----------
    X : array of shape (n_samples, n_dims)
        One observation per row.
    ridge : float, default 1e-9
        Value added to the diagonal of the covariance so that the
        log-determinant is finite.

    Returns
    -------
    float
        Entropy in bits. Differential entropy may be negative.

    Warns
    -----
    RuntimeWarning
        When n_samples <= n_dims. The sample covariance is then singular, so
        the log-determinant (and hence the returned value) is set mostly by
        ``ridge`` rather than by the data.
    """
    import warnings

    X = np.asarray(X, dtype=float)
    n, d = X.shape
    if n <= d:
        warnings.warn(
            f"gaussian_differential_entropy: n={n} samples for d={d} dimensions; "
            "the sample covariance is singular, so the result is dominated by "
            f"the ridge term ({ridge:g}) rather than by the data.",
            RuntimeWarning,
            stacklevel=2,
        )
    # np.cov subtracts the column means itself, so no explicit centring is
    # needed; atleast_2d covers d == 1, where np.cov returns a 0-d array.
    S = np.atleast_2d(np.cov(X, rowvar=False)) + ridge * np.eye(d)
    _, logdet = np.linalg.slogdet(S)
    H_nats = 0.5 * (d * (1 + np.log(2 * np.pi)) + logdet)
    return H_nats / np.log(2)

from scipy.special import digamma, gammaln

def knn_entropy(X, k=5):
    """Kozachenko-Leonenko kNN estimate of differential entropy, in bits.

    H = psi(n) - psi(k) + ln(c_d) + d * mean(ln r_i), converted to bits, where
    r_i is the Euclidean distance from sample i to its k-th nearest neighbour
    and c_d is the volume of the unit d-ball.

    Parameters
    ----------
    X : array of shape (n_samples, n_dims)
        One observation per row.
    k : int, default 5
        Neighbour order; must satisfy 1 <= k < n_samples.

    Returns
    -------
    float
        Estimated entropy in bits.

    Raises
    ------
    ValueError
        If k is outside the range 1 <= k < n_samples.
    """
    from scipy.spatial.distance import cdist

    X = np.asarray(X, dtype=float)
    n, d = X.shape
    if not 1 <= k < n:
        raise ValueError(
            f"k must satisfy 1 <= k < n_samples (got k={k}, n_samples={n})"
        )

    # cdist produces the (n, n) distance matrix directly, without first
    # materialising an (n, n, d) array of pairwise differences.
    D = cdist(X, X)
    # Each row contains the zero self-distance, so position k of the sorted
    # row is the k-th nearest neighbour. A partial sort is sufficient.
    kth = np.partition(D, k, axis=1)[:, k]

    # log-volume of unit d-ball
    log_c_d = (d / 2) * np.log(np.pi) - gammaln(d / 2 + 1)

    # The 1e-12 offset keeps the logarithm finite for duplicated samples.
    H_nats = digamma(n) - digamma(k) + log_c_d + d * np.mean(np.log(kth + 1e-12))
    return float(H_nats / np.log(2))


def pca_entropy_spectrum(X):
    """Cumulative share of the summed log-eigenvalues of the covariance of X.

    Parameters
    ----------
    X : array of shape (n_samples, n_dims)
        One observation per row.

    Returns
    -------
    frac : ndarray of shape (n_dims,)
        cumsum(log(eigs)) / sum(log(eigs)), eigenvalues in descending order.
    eigs : ndarray of shape (n_dims,)
        Covariance eigenvalues, floored at 1e-12 and sorted descending.

    Notes
    -----
    Logarithms of eigenvalues below 1 are negative, so ``frac`` is not
    guaranteed to lie in [0, 1] or to increase monotonically. When many
    eigenvalues sit at the 1e-12 floor they dominate the denominator.
    """
    centred = X - X.mean(0, keepdims=True)
    spectrum = np.linalg.eigvalsh(np.cov(centred, rowvar=False))
    floored = np.maximum(spectrum, 1e-12)
    eigs = np.sort(floored)[::-1]
    log_eigs = np.log(eigs)
    return np.cumsum(log_eigs) / log_eigs.sum(), eigs

# ---------------------------------------------------------
# 1) Image entropies
# ---------------------------------------------------------
exts = (".png",".jpg",".jpeg",".bmp",".tif",".tiff",".webp")
H_hist, H_bpp = [], []
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# per-image lists reproducible from run to run.
for fn in sorted(os.listdir(imgs_path)):
    if not fn.lower().endswith(exts):
        continue
    # The context manager closes the image file as soon as the pixels have
    # been copied out, instead of leaving the handle open until it is
    # garbage-collected.
    with Image.open(os.path.join(imgs_path, fn)) as im:
        im_u8 = np.array(im.convert("L"), dtype=np.uint8)
    H_hist.append(shannon_entropy_gray(im_u8))
    H_bpp.append(png_bits_per_pixel(im_u8))
print(f"[Images]  N={len(H_hist)}")
if H_hist:
    print(f"  Pixel entropy mean±sd: {np.mean(H_hist):.3f} ± {np.std(H_hist):.3f} bits")
    print(f"  PNG bits/pixel mean±sd: {np.mean(H_bpp):.3f} ± {np.std(H_bpp):.3f}")
else:
    # Mean and standard deviation of an empty list are NaN; report the cause.
    print(f"  No image files with extensions {exts} found in {imgs_path}")

# ---------------------------------------------------------
# 2) Load ViT logits → softmax → CLR embeddings
# ---------------------------------------------------------
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open(emb_path,'rb') as f:
    vit_dict = pickle.load(f)['natural_scenes']

# Iterating a mapping yields its keys, so take the values explicitly when the
# entry is a dict of logit vectors; an array or list of vectors is used as is.
logit_rows = vit_dict.values() if isinstance(vit_dict, dict) else vit_dict
embeddings = np.stack(list(logit_rows))   # shape (N_images, n_classes)
assert embeddings.ndim == 2, f"expected a 2-D logits matrix, got shape {embeddings.shape}"
print("Embeddings shape:", embeddings.shape)

# Class probabilities per image, then centred log-ratio coordinates.
X = softmax(embeddings, axis=1)
X_clr = clr(X)

# ---------------------------------------------------------
# 3) Embedding entropies
# ---------------------------------------------------------
# Two entropy estimates of the CLR embeddings: a Gaussian fit and a
# k-nearest-neighbour estimate with k = 5.
H_gauss, H_knn = gaussian_differential_entropy(X_clr), knn_entropy(X_clr, k=5)
print("[CLR embeddings]")
print(f"  Gaussian differential entropy: {H_gauss:.3f} bits")
print(f"  kNN differential entropy (k=5): {H_knn:.3f} bits")

# frac[k-1] is the cumulative share of the summed log-eigenvalues reached by
# the k leading principal components; sizes beyond the spectrum are skipped.
frac, eigs = pca_entropy_spectrum(X_clr)
for k in (8, 16, 32, 64, 128):
    if k > len(frac):
        continue
    print(f"  Top-{k:3d} PCs capture {frac[k-1]*100:6.2f}% of Gaussian entropy")

print("Done.")


[Images]  N=118
  Pixel entropy mean±sd: 7.534 ± 0.340 bits
  PNG bits/pixel mean±sd: 5.819 ± 0.713
Embeddings shape: (118, 1000)
[CLR embeddings]
  Gaussian differential entropy: -11053.434 bits
  kNN differential entropy (k=5): 2257.927 bits
  Top-  8 PCs capture  -0.14% of Gaussian entropy
  Top- 16 PCs capture  -0.24% of Gaussian entropy
  Top- 32 PCs capture  -0.39% of Gaussian entropy
  Top- 64 PCs capture  -0.58% of Gaussian entropy
  Top-128 PCs capture   0.69% of Gaussian entropy
Done.


In [4]:
from sklearn.decomposition import PCA
# Project the CLR embeddings onto 50 whitened principal components and
# re-estimate both entropies in that reduced space.
# NOTE(review): with whiten=True every retained component has unit variance,
# so the Gaussian entropy below is essentially 0.5 * 50 * log2(2*pi*e) bits
# whatever the data; confirm that whitening is intended here.
Xr = (
    PCA(n_components=50, whiten=True, random_state=0)
    .fit_transform(X_clr)
)

H_gauss_50, H_knn_50 = gaussian_differential_entropy(Xr), knn_entropy(Xr, k=5)
print(H_gauss_50, H_knn_50)

102.35478472442222 116.0582459385017
