In [1]:
import numpy as np
from datasets.tensor_storage import TensorStorage
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
import torch
import pandas as pd

In [None]:
# Seed NumPy's global RNG and PyTorch's default generator with the same value.
np.random.seed(42)
# manual_seed() returns the generator object, which is what this cell echoes as its output.
torch.manual_seed(42)

<torch._C.Generator at 0x7f980439c1f0>

In [3]:
store = TensorStorage("storages/mol2vec_dgsm")

# Read every entry once as a smoke test that the whole store is loadable.
# The result is bound to a descriptive name rather than `_`: assigning to `_`
# stops IPython from updating its "last output" shortcut for the session.
assert len(store) > 0, "TensorStorage is empty; nothing to inspect"
last_embedding = None
for i in tqdm(range(len(store))):
    last_embedding = store[i]
# Shape of a single stored embedding (the final one read).
last_embedding.shape

100%|██████████| 1576904/1576904 [00:06<00:00, 251213.91it/s]


(1, 300)

In [4]:
# Metadata table that accompanies the store (columns include smiles, success and tensor_idx).
metadata_df = store.load_metadata_table()

In [5]:
# `success` flags whether the embedding for that row was created without an error.
metadata_df.head()

Unnamed: 0,smiles,success,index,tensor_idx
0,CCO,True,0,0
1,C,True,1,1
2,CO,True,2,2
3,NCCS,True,3,3
4,NCCN,True,4,4


In [6]:
# Index labels of the rows whose embedding was created successfully.
non_error_index = metadata_df.loc[metadata_df['success'].eq(True)].index

In [7]:
# (rows with a usable embedding, total rows in the metadata table)
(len(non_error_index), len(metadata_df))

(1575727, 1576904)

In [8]:
# Confirm that every successfully-embedded entry can be read back from the store.
# The value is deliberately discarded; it is not bound to `_`, because assigning
# to `_` stops IPython from updating its "last output" shortcut.
for i in tqdm(non_error_index):
    store[i]

100%|██████████| 1575727/1575727 [00:06<00:00, 250266.96it/s]


In [9]:
def load_embeddings_batch(store, non_error_index, batch_size=1000):
    """
    Load embeddings in batches and yield them

    Args:
        store: Indexable container; ``store[idx]`` must return a torch.Tensor
            or an object exposing ``reshape`` (e.g. a NumPy array).
        non_error_index: Sliceable sequence of store indices to load, in order.
        batch_size: Maximum number of embeddings per yielded batch.

    Yields:
        np.ndarray with one row per index in the batch; each embedding is
        flattened to 1-D before the rows are stacked. The last batch may hold
        fewer than ``batch_size`` rows.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised when iteration
            starts, since this is a generator).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    for start in range(0, len(non_error_index), batch_size):
        batch_indices = non_error_index[start:start + batch_size]
        batch_embeddings = []

        for idx in batch_indices:
            emb = store[idx]
            if isinstance(emb, torch.Tensor):
                # detach()/cpu() keep the conversion working for tensors that
                # track gradients or are stored on a non-CPU device.
                emb = emb.detach().cpu().numpy()
            batch_embeddings.append(emb.reshape(-1))

        yield np.array(batch_embeddings)

In [11]:
def compute_principal_components(store, non_error_index, batch_size=1000, n_components=300):
    """
    Compute the principal components of the embedding space using IncrementalPCA

    Args:
        store: Embedding store, indexed by the entries of ``non_error_index``.
        non_error_index: Sequence of store indices to fit on.
        batch_size: Number of embeddings passed to each ``partial_fit`` call.
        n_components: Number of components to keep. The default of 300 keeps
            the value this function previously hard-coded.

    Returns:
        Tuple ``(eigenvectors, explained_variance_ratio, ipca)`` where
        ``eigenvectors`` is ``ipca.components_``, ``explained_variance_ratio``
        is ``ipca.explained_variance_ratio_`` and ``ipca`` is the fitted
        estimator.

    Raises:
        ValueError: If ``non_error_index`` is empty or ``batch_size`` is not positive.

    NOTE(review): IncrementalPCA may reject a batch that has fewer rows than
    ``n_components`` (always for the first batch; for later batches depending
    on the scikit-learn version) - confirm ``batch_size`` and the size of the
    trailing batch against the installed version.
    """
    if len(non_error_index) == 0:
        raise ValueError("non_error_index is empty; there is nothing to fit")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    ipca = IncrementalPCA(n_components=n_components)

    # Ceiling division gives the exact number of batches the loader yields;
    # `len // batch_size + 1` over-counts by one when the length is an exact
    # multiple of batch_size, leaving the progress bar short of 100%.
    n_batches = (len(non_error_index) + batch_size - 1) // batch_size

    # Fit PCA incrementally, one batch at a time
    for batch in tqdm(load_embeddings_batch(store, non_error_index, batch_size),
                      desc="Computing PCA",
                      total=n_batches):
        ipca.partial_fit(batch)

    # Get explained variance ratio and eigenvectors
    explained_variance_ratio = ipca.explained_variance_ratio_
    eigenvectors = ipca.components_  # one row per component

    return eigenvectors, explained_variance_ratio, ipca

In [12]:
def find_nearest_molecules(store, non_error_index, eigenvectors, metadata=None, n_neighbors=5):
    """
    Find the nearest molecules to each eigenvector using KDTree for efficient nearest neighbor search

    Every embedding and every eigenvector is scaled to unit length, then each
    eigenvector is matched to the embedding at the smallest Euclidean distance.

    Args:
        store: Embedding store, indexed by the entries of ``non_error_index``.
        non_error_index: Sequence of store indices / metadata index labels to search.
        eigenvectors: 2-D array with one eigenvector per row.
        metadata: Table with a ``smiles`` column, indexed by the same labels as
            ``non_error_index``. Defaults to the notebook-level ``metadata_df``
            so existing three-argument calls keep working.
        n_neighbors: How many matches to print for the first three
            eigenvectors (capped at the number of embeddings).

    Returns:
        Tuple ``(nearest_molecules, nearest_distances)``: two lists with one
        entry per eigenvector - the entry of ``non_error_index`` for the
        closest embedding, and its distance.

    Raises:
        ValueError: If ``non_error_index`` is empty, ``n_neighbors`` < 1, or an
            embedding has zero norm (it cannot be normalized).

    NOTE(review): the same value is used to index ``store`` and to look up
    ``metadata`` by label; this assumes store positions and metadata index
    labels coincide - confirm (the table also carries a ``tensor_idx`` column).
    NOTE(review): PCA components describe mean-centred data and are only
    defined up to sign, while the embeddings here are not centred - confirm
    that a plain Euclidean match is the intended comparison.
    """
    from sklearn.neighbors import KDTree

    if metadata is None:
        # Backward-compatible fallback to the table defined at notebook level.
        metadata = metadata_df
    if len(non_error_index) == 0:
        raise ValueError("non_error_index is empty; there are no embeddings to search")
    if n_neighbors < 1:
        raise ValueError(f"n_neighbors must be at least 1, got {n_neighbors}")

    # First, collect all embeddings into a matrix
    print("Building embeddings matrix...")
    all_embeddings = []
    for idx in tqdm(non_error_index, desc="Loading embeddings"):
        emb = store[idx]
        if isinstance(emb, torch.Tensor):
            # detach()/cpu() keep the conversion working for tensors that
            # track gradients or are stored on a non-CPU device.
            emb = emb.detach().cpu().numpy()
        emb = emb.reshape(-1)

        # Normalize embedding; fail early instead of feeding NaNs to the tree.
        norm = np.linalg.norm(emb)
        if norm == 0:
            raise ValueError(f"Embedding at index {idx} has zero norm and cannot be normalized")
        all_embeddings.append(emb / norm)

    all_embeddings = np.array(all_embeddings)

    # Print diagnostics for embeddings (row norms computed in one vectorized call)
    print("\nEmbeddings statistics:")
    print(f"Shape: {all_embeddings.shape}")
    print(f"Mean norm: {np.mean(np.linalg.norm(all_embeddings, axis=1))}")
    print(f"Mean: {np.mean(all_embeddings)}")
    print(f"Std: {np.std(all_embeddings)}")

    # Normalize eigenvectors
    print("\nEigenvectors statistics before normalization:")
    print(f"Shape: {eigenvectors.shape}")
    print(f"Mean norm: {np.mean([np.linalg.norm(vec) for vec in eigenvectors])}")
    print(f"Mean: {np.mean(eigenvectors)}")
    print(f"Std: {np.std(eigenvectors)}")

    # Work on a copy so the caller's eigenvectors are left untouched.
    normalized_eigenvectors = eigenvectors.copy()
    for i in range(len(normalized_eigenvectors)):
        normalized_eigenvectors[i] = normalized_eigenvectors[i] / np.linalg.norm(normalized_eigenvectors[i])

    print("\nEigenvectors statistics after normalization:")
    print(f"Mean norm: {np.mean([np.linalg.norm(vec) for vec in normalized_eigenvectors])}")

    # Build KDTree
    print("\nBuilding KDTree...")
    tree = KDTree(all_embeddings)

    # Query tree for each eigenvector
    print("Finding nearest neighbors...")
    nearest_molecules = []
    nearest_distances = []

    # The tree cannot return more neighbours than it holds points.
    k = min(n_neighbors, len(all_embeddings))

    for component_idx, eigenvector in enumerate(tqdm(normalized_eigenvectors, desc="Querying KDTree")):
        distances, indices = tree.query(eigenvector.reshape(1, -1), k=k)

        # Print first few matches for debugging
        if component_idx < 3:
            print(f"\nEigenvector {component_idx} top matches:")
            for d, i in zip(distances[0], indices[0]):
                orig_idx = non_error_index[i]
                # orig_idx is an index label, so look it up with .loc rather than by position.
                print(f"Distance: {d:.4f}, SMILES: {metadata.loc[orig_idx, 'smiles']}")

        # Tree rows are positions within non_error_index; map back to the original label.
        nearest_molecules.append(non_error_index[indices[0][0]])
        nearest_distances.append(distances[0][0])

    return nearest_molecules, nearest_distances


In [14]:
# Same selection as before, but as a plain NumPy array of index labels.
non_error_index = metadata_df.loc[metadata_df['success'].eq(True)].index.values

In [15]:
# Fit the incremental PCA over every successfully-embedded molecule, using the default batch size.
print("Computing principal components...")
eigenvectors, explained_variance_ratio, ipca = compute_principal_components(store, non_error_index)

Computing principal components...


Computing PCA: 100%|██████████| 1576/1576 [01:13<00:00, 21.49it/s]


In [16]:
# Guard the shape the following cells rely on: 300 components, each of dimension 300.
assert eigenvectors.shape == (300, 300), f"Expected shape (300, 300), got {eigenvectors.shape}"

In [17]:
# For each principal component, find the stored molecule whose unit-normalized embedding is closest.
print("Finding nearest molecules to eigenvectors...")
nearest_molecules, nearest_distances = find_nearest_molecules(store, non_error_index, eigenvectors)

Finding nearest molecules to eigenvectors...
Building embeddings matrix...


Loading embeddings:   0%|          | 0/1575727 [00:00<?, ?it/s]

Loading embeddings: 100%|██████████| 1575727/1575727 [00:14<00:00, 110814.08it/s]



Embeddings statistics:
Shape: (1575727, 300)
Mean norm: 1.0
Mean: 0.0023550980258733034
Std: 0.05768771097064018

Eigenvectors statistics before normalization:
Shape: (300, 300)
Mean norm: 1.0
Mean: 0.0003786748517078929
Std: 0.05773378507156808

Eigenvectors statistics after normalization:
Mean norm: 1.0

Building KDTree...
Finding nearest neighbors...


Querying KDTree:   0%|          | 0/300 [00:00<?, ?it/s]


Eigenvector 0 top matches:
Distance: 1.5871, SMILES: F[P-](F)(F)(F)(F)F
Distance: 1.5957, SMILES: IC(I)=C(I)I
Distance: 1.6127, SMILES: FS(F)(F)(F)(F)F
Distance: 1.6423, SMILES: S=C1SC2=C(S1)SC(=S)S2
Distance: 1.6499, SMILES: ClC12C3(Cl)C4(Cl)C5(Cl)C(Cl)(C1(Cl)C4(Cl)Cl)C2(Cl)C(Cl)(Cl)C35Cl


Querying KDTree:   1%|          | 2/300 [00:00<01:14,  4.00it/s]


Eigenvector 1 top matches:
Distance: 1.2769, SMILES: C
Distance: 1.3089, SMILES: C=C
Distance: 1.3172, SMILES: CC
Distance: 1.3173, SMILES: C1CCCCCCCCCCC1
Distance: 1.3173, SMILES: C1CCCCCCC1


Querying KDTree:   1%|          | 3/300 [00:00<01:26,  3.44it/s]


Eigenvector 2 top matches:
Distance: 1.1935, SMILES: [O-][n+]1onc2cc(Cl)c3nonc3c12
Distance: 1.1940, SMILES: Clc1nc(Cl)c2nc(Cl)nc(Cl)c2n1
Distance: 1.1951, SMILES: Clc1c(Cl)c(Cl)c2Oc3c(Cl)c(Cl)c(Cl)c(Cl)c3Oc2c1Cl
Distance: 1.1996, SMILES: Clc1nc(Cl)nc(Cl)n1
Distance: 1.2027, SMILES: Clc1cc2Oc3c(Cl)c(Cl)c(Cl)c(Cl)c3Oc2cc1Cl


Querying KDTree: 100%|██████████| 300/300 [04:25<00:00,  1.13it/s]


In [18]:
# One row per principal component, paired with its nearest molecule.
# nearest_molecules holds index *labels* taken from metadata_df.index, so the
# SMILES strings are fetched with a single label-based .loc lookup instead of
# a per-row positional .iloc access.
results_df = pd.DataFrame({
    'eigenvector_idx': range(len(nearest_molecules)),
    'nearest_molecule_idx': nearest_molecules,
    'distance': nearest_distances,
    'explained_variance_ratio': explained_variance_ratio,
    'cumulative_variance_ratio': np.cumsum(explained_variance_ratio),
    'smiles': metadata_df.loc[nearest_molecules, 'smiles'].tolist(),
})

In [19]:
# Order the rows from the most to the least explanatory component.
results_df = results_df.sort_values(by='explained_variance_ratio', ascending=False)

In [20]:
# Display the per-component table (the rendered output elides the middle rows).
results_df

Unnamed: 0,eigenvector_idx,nearest_molecule_idx,distance,explained_variance_ratio,cumulative_variance_ratio,smiles
0,0,8636,1.587104,9.638863e-01,0.963886,F[P-](F)(F)(F)(F)F
1,1,1,1.276939,1.030605e-02,0.974192,C
2,2,129427,1.193488,6.477860e-03,0.980670,[O-][n+]1onc2cc(Cl)c3nonc3c12
3,3,33671,1.262933,3.138154e-03,0.983808,FC(F)(F)c1ccc(Cl)cc1Cl
4,4,3237,1.243460,2.084765e-03,0.985893,ClP(Cl)(Cl)=O
...,...,...,...,...,...,...
295,295,155,1.396013,9.383385e-07,0.999997,O=C=O
296,296,31,1.377199,9.101521e-07,0.999998,N#N
297,297,9050,1.341068,8.582587e-07,0.999998,ClP(Cl)(Cl)(Cl)Cl
298,298,5261,1.391343,7.972005e-07,0.999999,O[Cl](=O)(=O)=O


In [21]:
# Raw list of nearest-molecule labels, one per component; several labels recur
# (e.g. 2081, 243), i.e. the same molecule is the closest match for multiple components.
nearest_molecules

[8636,
 1,
 129427,
 33671,
 3237,
 19916,
 1446112,
 704526,
 222696,
 179,
 3281,
 191925,
 30321,
 1512599,
 1220,
 15570,
 222696,
 15570,
 884340,
 83240,
 25579,
 11438,
 2081,
 25579,
 17225,
 15736,
 2399,
 11438,
 23727,
 123140,
 277623,
 123163,
 25579,
 4280,
 163595,
 84258,
 243,
 12071,
 1793,
 719576,
 15570,
 704526,
 1091331,
 1793,
 11744,
 1249,
 156,
 56697,
 34706,
 85655,
 2081,
 83,
 155,
 243,
 1564406,
 56697,
 1497,
 11967,
 23727,
 704526,
 2081,
 1091331,
 1249,
 156,
 11967,
 15736,
 86620,
 56697,
 155,
 243,
 704526,
 308,
 2237,
 243,
 11967,
 1220,
 1512230,
 11967,
 29012,
 783219,
 2081,
 1091331,
 56697,
 243,
 1564406,
 2081,
 31369,
 5261,
 15734,
 11967,
 1497,
 145545,
 2081,
 2081,
 1497,
 5216,
 8636,
 1497,
 1091331,
 8636,
 4280,
 1300,
 5216,
 14761,
 12187,
 85655,
 155,
 617,
 18826,
 756865,
 1497,
 155,
 172591,
 11967,
 1439694,
 963,
 3190,
 3190,
 5216,
 8636,
 1793,
 25579,
 10754,
 1340522,
 103068,
 1220,
 145545,
 10862,
 3815,
 

In [21]:
# Persist the per-component summary; index=False leaves the DataFrame index out of the file.
results_df.to_csv('mol_pca_results.csv', index=False)

In [22]:
# Save the component matrix (one eigenvector per row) for reuse outside this notebook.
np.save('m2v_eigenvectors.npy', eigenvectors)