In [1]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
import torch

In [2]:
root_dir = Path("/home/max/Desktop/python_projects/SimJeb/")
metadata_dir = root_dir / "SimJEB_metadata"
sdf_cluster_dir = root_dir / "SimJEB_cluster_distances"

In [3]:
df = pd.read_csv(metadata_dir / "all_bracket_metadata.csv")

In [4]:
df.head()

Unnamed: 0,id,num_vertices,num_faces,volume,surface_area,average_edge_length,genus,max_ver_xdisp,max_ver_ydisp,max_ver_zdisp,...,category,num_tets,mass,download_file,link_name,author,author_id,test_split_0,test_split_1,test_split_2
0,0,42360,84764,307642.703591,73622.298369,1.347077,12.0,0.176923,0.073638,0.288827,...,block,570111,1.375163,FINAL BRACKET AARON WEISSBART 8.9.2013 1930 pa...,ge-bracket-001-1,Aaron Weissbart,aaron.weissbart,False,True,False
1,4,35622,71368,104015.598723,39149.667547,1.048514,32.0,0.687567,0.426113,1.059176,...,beam,431759,0.46495,bracket.STEP,ripple-bracket-1,simon,simon-240,False,False,False
2,6,35186,70532,114996.439753,58499.675375,1.30674,41.0,1.180213,2.789576,3.713256,...,block,314815,0.514034,10813 new GE bracket.IGS,ge-bracket-26,Amartesh Sehgal,amartesh.sehgal-1,False,False,False
3,8,24577,49174,89326.268569,30458.408764,1.130023,6.0,1.044753,0.299482,1.285331,...,beam,293971,0.399288,bracket_15.igs,ge-engine-bracket-15-1,Mandli Peter,mandli.peter,False,False,False
4,9,38322,76720,78486.321312,38243.142004,0.989507,20.0,1.391756,0.360266,1.927059,...,beam,481030,0.350834,EngineBracketModified_PTJ.igs,modified-engine-bracket-2,Penn,penn-1,False,False,False


In [5]:
len(df)

381

In [6]:
with open(metadata_dir / "outliers.txt") as f:
    outliers = f.readlines()
outliers = [int(e.strip()) for e in outliers]

In [7]:
len(outliers)

34

In [8]:
df = df[~df['id'].isin(outliers)]

In [9]:
len(df)

347

In [15]:
sdf = []
for id_ in tqdm(df.id):
    path = sdf_cluster_dir / f"{id_}.npy"
    sdf.append(np.load(path).reshape(1,-1))
sdf = np.vstack(sdf)

  0%|          | 0/347 [00:00<?, ?it/s]

# PC distances dim reduction

In [18]:
sdf.shape

(347, 1000000)

In [19]:
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(sdf)

In [20]:
embeddings_pca.shape

(347, 2)

In [23]:
pca.explained_variance_ratio_

array([0.37870489, 0.16133472])

In [21]:
df["component_0"] = embeddings_pca[:,0]
df["component_1"] = embeddings_pca[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["component_0"] = embeddings_pca[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["component_1"] = embeddings_pca[:,1]


In [25]:
fig = px.scatter(
    df,
    x='component_0',
    y='component_1',
    color='category',
    hover_data=['id'],
    title='PCA of SimJEB geometries based on pointcloud distances',
    labels={'X': 'component_0', 'Y': 'component_1', 'Type': 'category', 'ID': 'ID'}
)

# Save the plot to an HTML file
fig.write_html(metadata_dir / "pointcloud_distance_pca_categories.html")

In [28]:
embeddings = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3).fit_transform(sdf)

In [29]:
df["tsne_component_0"] = embeddings[:,0]
df["tsne_component_1"] = embeddings[:,1]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
fig = px.scatter(
    df,
    x='tsne_component_0',
    y='tsne_component_1',
    color='category',
    hover_data=['id'],
    title='tSNE of SimJEB geometries based on pointcloud distances',
    labels={'X': 'component_0', 'Y': 'component_1', 'Type': 'category', 'ID': 'ID'}
)

# Save the plot to an HTML file
fig.write_html(metadata_dir / "pointcloud_distance_tsne_categories.html")

# DeepSDF embeddings

In [10]:
codes_dir = root_dir / "experiments/mixed/LatentCodes"

In [11]:
pth_path = codes_dir / "latest.pth"

In [12]:
data = torch.load(pth_path)

In [18]:
data["latent_codes"]["weight"].shape

torch.Size([294, 64])

In [19]:
with open(metadata_dir / "train.txt") as f:
    train = f.readlines()
train = [int(e.strip()) for e in train]

In [20]:
len(train)

294

In [24]:
df_train = df[df['id'].isin(train)]

In [25]:
len(df_train)

294

In [21]:
pca = PCA(n_components=2)
transform = pca.fit(data["latent_codes"]["weight"])

In [36]:
pca.explained_variance_ratio_

array([0.14105572, 0.07826261])

In [33]:
embeddings = transform.transform(data["latent_codes"]["weight"])
df_embeddings = pd.DataFrame({"id": train, "component_0": embeddings[:,0], "component_1": embeddings[:,1]})
df_tmp = df_train.merge(df_embeddings)

In [35]:
fig = px.scatter(
    df_tmp,
    x='component_0',
    y='component_1',
    color='category',
    hover_data=['id'],
    title='PCA of SimJEB geometries based on DeepSDF embeddings',
    labels={'X': 'component_0', 'Y': 'component_1', 'Type': 'category', 'ID': 'ID'}
)

# Save the plot to an HTML file
fig.write_html(metadata_dir / "deepsdf_categories.html")

In [None]:
for i in range(100, 1000, 100):
    pth_path = codes_dir / f"{i}.pth"
    codes = torch.load(pth_path)["latent_codes"]["weight"]
    embeddings = 