In [None]:
import os
import time

import numpy as np
import pandas as pd
import polars as pl
import umap
from sklearn.decomposition import PCA

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Location of the embeddings parquet file. The EMBEDDINGS_PATH environment
# variable overrides it so the notebook can run on other machines; the
# fallback is the original author's local path.
embeddings_path = os.environ.get(
    "EMBEDDINGS_PATH",
    "/Users/ma9o/Desktop/serengpty/apps/data-pipeline/data/dagster/conversations_embeddings/cm0i27jdj0000aqpa73ghpcxf.snappy",
)

# Load the embeddings file
df = pl.read_parquet(embeddings_path)

# Extract the "embedding" column and stack it into a float32 numpy array
embeddings_list = df["embedding"].to_list()
embeddings_array = np.array(embeddings_list, dtype=np.float32)

# Every later cell indexes shape[0] / shape[1] and feeds this array to
# fit_transform, so fail here with a clear message if it is empty or not a
# 2D (rows x features) array.
if embeddings_array.ndim != 2 or embeddings_array.shape[0] == 0:
    raise ValueError(
        f"Expected a non-empty 2D embeddings array, got shape {embeddings_array.shape} "
        f"from {embeddings_path}"
    )

# Check dimensionality
print(f"Number of embeddings: {embeddings_array.shape[0]}")
print(f"Embedding dimension: {embeddings_array.shape[1]}")

In [None]:
# Output dimensionalities to benchmark for both reducers
target_dims = [2, 3, 10, 50, 100]

# PCA rejects n_components > min(n_samples, n_features); drop such targets up
# front so a small dataset yields a shorter benchmark instead of a ValueError.
# target_dims is reassigned so later cells iterate over the same dimensions.
max_components = min(embeddings_array.shape)
skipped_dims = [dim for dim in target_dims if dim > max_components]
if skipped_dims:
    print(f"Skipping dimensions {skipped_dims}: exceed min(n_samples, n_features) = {max_components}")
target_dims = [dim for dim in target_dims if dim <= max_components]


def _time_fit_transform(reducer, data):
    """Run ``reducer.fit_transform(data)`` and measure how long it takes.

    Uses ``time.perf_counter`` (monotonic, high resolution) rather than
    ``time.time``, whose wall-clock value can jump and has coarser resolution.

    Returns:
        (elapsed_seconds, transformed_data)
    """
    start = time.perf_counter()
    transformed = reducer.fit_transform(data)
    return time.perf_counter() - start, transformed


# One record per (method, dimensions) run; consumed by the plotting/summary cells
results = []

# Benchmark PCA
for dim in target_dims:
    pca = PCA(n_components=dim)
    execution_time, pca_result = _time_fit_transform(pca, embeddings_array)

    # Total fraction of variance captured by the retained components
    explained_variance = float(pca.explained_variance_ratio_.sum())
    results.append({
        'method': 'PCA',
        'dimensions': dim,
        'time': execution_time,
        'explained_variance': explained_variance
    })

    print(f"PCA {dim}D: {execution_time:.4f} seconds - Explained variance: {explained_variance:.4f}")

# Benchmark UMAP (default hyperparameters, fixed seed)
# NOTE(review): the first UMAP call likely also pays a one-time JIT compilation
# cost, which would inflate the first timing - confirm before comparing it.
for dim in target_dims:
    reducer = umap.UMAP(n_components=dim, random_state=42)
    execution_time, umap_result = _time_fit_transform(reducer, embeddings_array)

    results.append({
        'method': 'UMAP',
        'dimensions': dim,
        'time': execution_time,
        'explained_variance': None  # UMAP doesn't provide explained variance
    })

    print(f"UMAP {dim}D: {execution_time:.4f} seconds")

In [None]:
# Tabulate the benchmark records: one row per (method, dimensions) run
results_df = pd.DataFrame(results)

# Grouped bar chart of execution time, one bar group per output dimensionality
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=results_df, x='dimensions', y='time', hue='method', ax=ax)
ax.set_title('Execution Time Comparison: PCA vs UMAP')
ax.set_xlabel('Output Dimensions')
ax.set_ylabel('Time (seconds)')
# Logarithmic y-axis keeps both methods readable when their timings differ a lot
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
ax.legend(title='Method')
plt.show()

In [None]:
# Single seed for both reducers and the subsample so the figure is
# reproducible across kernel restarts
VIS_SEED = 42

# Get 2D embeddings for visualization
pca_2d = PCA(n_components=2, random_state=VIS_SEED).fit_transform(embeddings_array)
umap_2d = umap.UMAP(n_components=2, random_state=VIS_SEED).fit_transform(embeddings_array)

# Plot only a random subset of points (at most 100) for better visibility.
# The same indices are used in both panels so they show the same embeddings.
sample_size = min(100, embeddings_array.shape[0])
rng = np.random.default_rng(VIS_SEED)
indices = rng.choice(embeddings_array.shape[0], size=sample_size, replace=False)

# Side-by-side scatter plots: PCA on the left, UMAP on the right
fig, (ax_pca, ax_umap) = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (ax_pca, pca_2d, 'PCA - 2D Projection'),
    (ax_umap, umap_2d, 'UMAP - 2D Projection'),
)
for ax, projection, title in panels:
    ax.scatter(projection[indices, 0], projection[indices, 1], alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Summary table of every benchmark run
print("Performance Summary:\n")
print(results_df)

# Align the timings by output dimension (rather than relying on row order) so
# each ratio compares PCA and UMAP at the same dimensionality. Dimensions that
# lack a timing for either method are dropped.
times_by_dim = results_df.pivot(index='dimensions', columns='method', values='time')
time_ratios = (times_by_dim['UMAP'] / times_by_dim['PCA']).dropna()

print("\nUMAP/PCA time ratio by dimensions:")
for dim, factor in time_ratios.items():
    if factor > 1:
        print(f"{dim}D: UMAP is {factor:.2f}x slower than PCA")
    else:
        # A ratio below 1 means UMAP took less time; invert it so the printed
        # multiple reads correctly (ratio 0.5 -> "2.00x faster").
        print(f"{dim}D: UMAP is {1 / factor:.2f}x faster than PCA")

# Average ratio across dimensions; the geometric mean is used because the
# values are ratios (a 2x slowdown and a 2x speedup average to 1x).
mean_ratio = float(np.exp(np.log(time_ratios).mean()))
print(f"\nAverage (geometric mean) UMAP/PCA time ratio: {mean_ratio:.2f}x")

# PCA explained variance analysis
pca_variance = results_df.loc[results_df['method'] == 'PCA', ['dimensions', 'explained_variance']]
print("\nPCA Explained Variance:")
print(pca_variance)

# Conclusion
print("\nConclusion:")
# Point 1 is derived from the measured timings instead of being asserted.
if mean_ratio > 1:
    print("1. PCA is generally faster than UMAP for this dataset.")
else:
    print("1. UMAP was on average at least as fast as PCA for this dataset.")
# Points 2-4 are general statements and are not checked against the
# measurements above.
print("2. PCA's explained variance increases with more dimensions.")
print("3. UMAP tends to preserve local structure better but at a higher computational cost.")
print("4. The choice between PCA and UMAP depends on whether speed (PCA) or structure preservation (UMAP) is more important.")