In [11]:
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering

# Load embeddings
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
#output_path="/home/maria/Downloads/scotus_embeddings_small.npy"
embeddings = np.load(output_path)

# Reduce dimensionality with PCA
pca = PCA(n_components=3)
pca_result_ = pca.fit_transform(embeddings)[:5000]  # Taking first 2000 points for better visualization

# Apply Spectral Clustering
num_clusters = 3 # You can change this based on how many clusters you expect
clustering = SpectralClustering(n_clusters=num_clusters, affinity='rbf', assign_labels='cluster_qr', random_state=42)
cluster_labels = clustering.fit_predict(pca_result_)

# Convert to DataFrame
df = pd.DataFrame(pca_result_[:,:3], columns=["PC1", "PC2", "PC3"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 3D scatter plot
fig = px.scatter_3d(df, x="PC1", y="PC2", z="PC3", opacity=0.6, 
                     title="Interactive 3D PCA Visualization of SCOTUS Embeddings (Colored by Clusters)",
                     color=df["Cluster"])

fig.update_traces(marker=dict(size=3, line=dict(width=0.5, color="black")))
fig.show()


In [27]:
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation

# Load embeddings
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
embeddings = np.load(output_path)

# Reduce dimensionality with PCA
pca = PCA(n_components=3)
pca_result_ = pca.fit_transform(embeddings)[:2000]  # Taking first 2000 points for better visualization

# Perform Affinity Propagation clustering
affinity_propagation = AffinityPropagation(damping=0.9, preference=-50, random_state=42)
cluster_labels = affinity_propagation.fit_predict(pca_result_)

# Convert to DataFrame
df = pd.DataFrame(pca_result_, columns=["PC1", "PC2", "PC3"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 3D scatter plot
fig = px.scatter_3d(df, x="PC1", y="PC2", z="PC3", opacity=0.6, 
                     title="Interactive 3D PCA Visualization of SCOTUS Embeddings (Affinity Propagation Clustering)",
                     color=df["Cluster"])

fig.update_traces(marker=dict(size=3, line=dict(width=0.5, color="black")))
fig.show()


In [7]:
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster

# Load embeddings
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
embeddings = np.load(output_path)

# Reduce dimensionality with PCA
pca = PCA(n_components=3)
pca_result_ = pca.fit_transform(embeddings)[:2000]  # Taking first 2000 points for better visualization

# Perform Hierarchical Clustering (Single Linkage)
Z = linkage(pca_result_, method='single')  # Single-linkage clustering
num_clusters = 6  # Adjust as needed
cluster_labels = fcluster(Z, num_clusters, criterion='maxclust')

# Convert to DataFrame
df = pd.DataFrame(pca_result_, columns=["PC1", "PC2", "PC3"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 3D scatter plot
fig = px.scatter_3d(df, x="PC1", y="PC2", z="PC3", opacity=0.6, 
                     title="Interactive 3D PCA Visualization of SCOTUS Embeddings (Hierarchical Clustering - Single Linkage)",
                     color=df["Cluster"])

fig.update_traces(marker=dict(size=3, line=dict(width=0.5, color="black")))
fig.show()


In [4]:
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.decomposition import KernelPCA
from sklearn.cluster import SpectralClustering

# Load embeddings
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
embeddings = np.load(output_path)[:2000]

# Apply Kernel PCA for nonlinear dimensionality reduction
kpca = KernelPCA(n_components=3, kernel='sigmoid', gamma=0.01)  # You can tweak gamma for better separation
kpca_result_ = kpca.fit_transform(embeddings)  # Taking first 2000 points for better visualization

# Perform Spectral Clustering
num_clusters = 3 # Change if needed
clustering = SpectralClustering(n_clusters=num_clusters, affinity='nearest_neighbors', assign_labels='kmeans', random_state=42)
cluster_labels = clustering.fit_predict(kpca_result_)

# Convert to DataFrame
df = pd.DataFrame(kpca_result_, columns=["PC1", "PC2", "PC3"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 3D scatter plot
fig = px.scatter_3d(df, x="PC1", y="PC2", z="PC3", opacity=0.6, 
                     title="Interactive 3D KernelPCA Visualization of SCOTUS Embeddings (Spectral Clustering)",
                     color=df["Cluster"])

fig.update_traces(marker=dict(size=3, line=dict(width=0.5, color="black")))
fig.show()


In [5]:
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.decomposition import KernelPCA
from sklearn.cluster import SpectralClustering

# Load embeddings
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
embeddings = np.load(output_path).T

# Apply Kernel PCA for nonlinear dimensionality reduction to 2D
kpca = KernelPCA(n_components=3, kernel='cosine', gamma=0.01)  # You can tweak gamma for better separation
kpca_result_ = kpca.fit_transform(embeddings) # Taking first 2000 points for better visualization

# Perform Spectral Clustering
num_clusters = 6  # Change if needed
clustering = SpectralClustering(n_clusters=num_clusters, affinity='nearest_neighbors', assign_labels='kmeans', random_state=42)
cluster_labels = clustering.fit_predict(kpca_result_)

# Convert to DataFrame
df = pd.DataFrame(kpca_result_, columns=["PC1", "PC2"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 2D scatter plot
fig = px.scatter(df, x="PC1", y="PC2", opacity=0.7, 
                 title="2D KernelPCA Visualization of SCOTUS Embeddings (Spectral Clustering)",
                 color=df["Cluster"])

fig.update_traces(marker=dict(size=5, line=dict(width=0.5, color="black")))
fig.show()


ValueError: Shape of passed values is (3072, 3), indices imply (3072, 2)

In [35]:
import numpy as np
import plotly.express as px
import pandas as pd
import umap
from sklearn.cluster import SpectralClustering

# Load embeddings
texts=pd.read_csv("/home/maria/Neurogarage2/scotusCebra/scotus_with_summaries_ordered_full.csv").loc[:5000, "summary"]
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
#output_path="/home/maria/Downloads/scotus_embeddings_small.npy"
embeddings = np.load(output_path)[:5000]

# Apply UMAP for nonlinear dimensionality reduction to 3D
umap_model = umap.UMAP(n_components=3, n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
umap_result_ = umap_model.fit_transform(embeddings)  # Taking first 2000 points for better visualization

# Perform Spectral Clustering
num_clusters = 5 # Adjust as needed
clustering = SpectralClustering(n_clusters=num_clusters, affinity='nearest_neighbors', assign_labels='kmeans', random_state=42)
cluster_labels = clustering.fit_predict(umap_result_)

# Convert to DataFrame
df = pd.DataFrame(umap_result_, columns=["UMAP1", "UMAP2", "UMAP3"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 3D scatter plot
fig = px.scatter_3d(df, x="UMAP1", y="UMAP2", z="UMAP3", opacity=0.7, 
                     title="3D UMAP Visualization of SCOTUS Embeddings (Spectral Clustering)",
                     color=df["Cluster"])

fig.update_traces(marker=dict(size=3, line=dict(width=0.5, color="black")))
fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


Graph is not fully connected, spectral embedding may not work as expected.



In [None]:
import numpy as np
import plotly.express as px
import pandas as pd
import umap
from sklearn.cluster import SpectralClustering

# Load embeddings
texts=pd.read_csv("/home/maria/Neurogarage2/scotusCebra/scotus_with_summaries_ordered_full.csv").loc[:100]
output_path = "/home/maria/Downloads/scotus_embeddings.npy"
#output_path="/home/maria/Downloads/scotus_embeddings_small.npy"
embeddings = np.load(output_path)[:1000]
print(embeddings.shape)
# Reduce to 50D with PCA first
pca = PCA(n_components=600)
pca_result = pca.fit_transform(embeddings)
print(pca_result.shape)
# Then apply UMAP to get a 3D embedding
umap_model = umap.UMAP(n_components=3, n_neighbors=50, min_dist=0.5, metric='chebyshev', random_state=42)
umap_result = umap_model.fit_transform(pca_result)
print(umap_result.shape)

# Perform Spectral Clustering
num_clusters = 5 # Adjust as needed
clustering = SpectralClustering(n_clusters=num_clusters, affinity='nearest_neighbors', assign_labels='kmeans', random_state=42)
cluster_labels = clustering.fit_predict(umap_result)

# Convert to DataFrame
df = pd.DataFrame(umap_result, columns=["UMAP1", "UMAP2", "UMAP3"])
df["Cluster"] = cluster_labels.astype(str)  # Convert to string for categorical coloring

# Plot interactive 3D scatter plot
fig = px.scatter_3d(df, x="UMAP1", y="UMAP2", z="UMAP3", opacity=0.7, 
                     title="3D UMAP Visualization of SCOTUS Embeddings (Spectral Clustering)",
                     color=df["Cluster"])

fig.update_traces(marker=dict(size=3, line=dict(width=0.5, color="black")))
fig.show()


(1000, 3072)
(1000, 600)



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



(1000, 3)
