# Existence of semantic manifolds

In [1]:
import numpy as np
from data.generate import main 

words = ['dog', 'cat', 'mammal', 'insect', 'bird']
embeddings_raw = main(['mammal'], model="openai")



  Referenced from: <6DFB383A-E1D9-3EC6-8A60-382AF4E3C226> /Users/matthieu/.pyenv/versions/3.10.0/lib/python3.10/site-packages/torchvision/image.so
  warn(
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matthieu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


2250
['Someone has a Narwhal', 'Someone has a Narwal', 'Someone has a Narwhale', 'Someone has a Monodon monoceros', 'Someone has a Papillon', 'Someone has a Brittany spaniel', 'Someone has a Rhesus', 'Someone has a Rhesus monkey', 'Someone has a Macaca mulatta', 'Someone has a Sand rat', 'Someone has a Meriones longifrons', 'Someone has a Springbok', 'Someone has a Springbuck', 'Someone has an Antidorcas marsupialis', 'Someone has an Antidorcas euchore', 'Someone has a Pomeranian', 'Someone has a Plantigrade mammal', 'Someone has a Plantigrade', 'Someone has a Sussex spaniel', 'Someone has a Red fox', 'Someone has a Vulpes fulva', 'Someone has a Staghound', 'Someone has a Cryptoprocta', 'Someone has a Genus cryptoprocta', 'Someone has a White wolf', 'Someone has an Arctic wolf', 'Someone has a Canis lupus tundrarum', 'Someone has a Poodle', 'Someone has a Poodle dog', 'Someone has a New world porcupine', 'Someone has a Pacific walrus', 'Someone has an Odobenus divergens', 'Someone has 

In [3]:
# Dimensionality reduction
from src.dimensionality_reduction.tangent_pca import TangentPCAEvaluator, TangentPCAProcessor

processor = TangentPCAProcessor(embeddings_raw,  n_components=400)
log_data, reduced_data, approx_data, pca = processor.process()

evaluator = TangentPCAEvaluator(processor.data, reduced_data, log_data, approx_data, pca)
evaluator.evaluate()



Explained Variance (Forward): 0.9541
Explained Variance (Backward): 3126.9126
Reconstruction Error (Frobenius norm): 39.0921
Reconstruction Error: 0.0000
Trustworthiness: 1.0000
Correlation between original and reduced distances: 0.9973
Continuity: 1.0000
Explained Variance Score: 0.7465


In [5]:
# Fitting the model 
from src.models.gaussian_mixture import run_gmm_model

gmm = run_gmm_model(reduced_data)



[I 2024-12-08 23:48:59,975] A new study created in memory with name: no-name-0f0d2cc9-7dc3-4a99-8d7f-00a1aed4cec8


  reg_covar = trial.suggest_loguniform("reg_covar", 1e-6, 1e-3)
[I 2024-12-08 23:49:00,214] Trial 0 finished with value: 4340534.308375702 and parameters: {'n_components': 5, 'covariance_type': 'diag', 'reg_covar': 1.9543869147730843e-05}. Best is trial 0 with value: 4340534.308375702.
  reg_covar = trial.suggest_loguniform("reg_covar", 1e-6, 1e-3)
[I 2024-12-08 23:49:00,440] Trial 1 finished with value: 3735605.4105289704 and parameters: {'n_components': 8, 'covariance_type': 'spherical', 'reg_covar': 6.015049013329767e-05}. Best is trial 1 with value: 3735605.4105289704.
  reg_covar = trial.suggest_loguniform("reg_covar", 1e-6, 1e-3)
[I 2024-12-08 23:49:00,710] Trial 2 finished with value: 4345450.077962407 and parameters: {'n_components': 6, 'covariance_type': 'diag', 'reg_covar': 9.850985172172154e-06}. Best is trial 1 with value: 3735605.4105289704.
  reg_covar = trial.suggest_loguniform("reg_covar", 1e-6, 1e-3)
[I 2024-12-08 23:49:05,280] Trial 3 finished with value: -753985.1984

Best Parameters: {'n_components': 10, 'covariance_type': 'full', 'reg_covar': 7.134852279001405e-05}
Best BIC: -856994.4222737243

--- Model Validation ---
Train Log-Likelihood: 2180928.65
Train BIC: 1679616.97
Train AIC: -2749839.30
Train MSE (Reconstruction): 0.000870
Train Perplexity: 0.000000

Test Log-Likelihood: 384019.92
Test BIC: 4156068.69
Test AIC: 843978.15
Test MSE (Reconstruction): 0.000927
Test Perplexity: 0.000000

Cross-Validation Results:
Mean Log-Likelihood: 375774.09 ± 9366.28
Mean BIC: 4172560.35 ± 18732.57
Silhouette Score: 0.06

Log-Likelihood Stats:
Mean: 1192.07, Std: 90.39

Cluster Stability Check: Cluster Counts Across Splits: [10, 10, 10]


In [7]:
# Assessment of the model 
import numpy as np

def classify_points_by_density(gmm, data, percentile=5):
    """
    Classify points based on density threshold.

    Args:
        gmm: Fitted Gaussian Mixture Model.
        data: Data points in the reduced space.
        percentile: Percentile threshold for defining high-density regions.

    Returns:
        inside_mask: Boolean mask for points inside the high-density region.
        outside_mask: Boolean mask for points outside the high-density region.
    """
    # Compute densities
    log_probs = gmm.score_samples(data)  # Log-likelihood for each point
    probs = np.exp(log_probs)  # Convert to probabilities
    
    # Determine density threshold
    threshold = np.percentile(probs, percentile)  # Threshold for high density
    inside_mask = probs > threshold  # Points inside the high-density region
    outside_mask = ~inside_mask  # Points outside the high-density region
    
    return inside_mask, outside_mask

def process_and_reproject_points(gmm, data, pca_mean, pca_components, tangent_mean, percentile=5):
    """
    Classify points by density and reproject to original space.

    Args:
        gmm: Fitted Gaussian Mixture Model.
        data: Data points in the reduced space.
        pca_mean: Mean vector from PCA.
        pca_components: PCA components for inverse projection.
        tangent_mean: Tangent space center for reprojection to hypersphere.
        percentile: Percentile threshold for defining high-density regions.

    Returns:
        inside_points: Points inside the high-density region in reduced space.
        outside_points: Points outside the high-density region in reduced space.
        original_inside_points: Points inside the high-density region in the original space.
    """
    # Classify points
    inside_mask, outside_mask = classify_points_by_density(gmm, data, percentile)
    
    # Separate inside and outside points
    inside_points = data[inside_mask]
    outside_points = data[outside_mask]
    
    print("Number of points inside:", len(inside_points))
    print("Number of points outside:", len(outside_points))
    
    # Reproject inside points to original space
    original_inside_points = reproject_points(inside_points, pca_mean, pca_components, tangent_mean)
    
    print("Reprojected Inside Points:", original_inside_points)
    
    return inside_points, outside_points, original_inside_points

n_samples = 200
threshold = 0.10
samples, _ = gmm.sample(n_samples=n_samples)

# Compute probabilities for samples
log_probs_samples = gmm.score_samples(samples)
probs_samples = np.exp(log_probs_samples)

# Filter samples by threshold
inside_samples = samples[probs_samples > threshold]
print("Number of samples inside:", len(inside_samples))

outside_samples = samples[probs_samples <= threshold]
print("Number of samples outside:", len(outside_samples))


AttributeError: 'NoneType' object has no attribute 'score_samples'

Now that we have a sample model, and it performs well, we can assess the existence of the manifold. We sample points inside the manifold and outside, and we check that the model can separate them. 

In [8]:
# Import the reverse model 
from src.reverse import initialize_model_and_encoder
enc, model = initialize_model_and_encoder()

enc.decode(model.generate(sampled_points, max_new_tokens=100, temperature=0.1))



ckpt.pt:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

  checkpoint = torch.load(hf_hub_download(repo_id="MF-FOOM/wikivec2text", filename="ckpt.pt"), map_location=device)


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
# Testing for a sample 



# Boundaries

In [None]:
We can now assess the boundaries of the manifold, which should travel along the manifold. 

We test different methods of moving along the manifold, and show that similar patterns are found for different words. 