In [None]:
#| hide
# Development convenience: enable IPython's autoreload extension in mode 2 so
# that edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# molcluster
> Cluster molecules using k-means, Butina, or HDBSCAN.

## How to use

You can use any function to generate descriptors for the molecules in the dataset. For instance, we could use [Morgan fingerprints](https://pubs.acs.org/doi/10.1021/ci100050t) from [RDKit](https://www.rdkit.org/docs/GettingStartedInPython.html) to generate a vector of 1024 bits for each molecule.

In [None]:
#| hide
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

In [None]:
from molcluster.unsupervised_learning.clustering import KMeansClustering, HDBSCANClustering, ButinaClustering
from molcluster.unsupervised_learning.transform import UMAPTransform, PCATransform

In [None]:
# Load the example dataset from a path relative to the notebook's working
# directory. Its `processed_smiles` column is what the cells below turn into
# fingerprints and pass to Butina clustering.
data = pd.read_csv('data/fxa_processed.csv')

In [None]:
# Build the descriptor matrix: one 1024-bit Morgan fingerprint per molecule.
# The fingerprint length is set with `nBits`; `radius` is the atom-neighbourhood
# radius (2 is the conventional ECFP4-style setting), not the vector size.
# NOTE(review): assumes every entry of `processed_smiles` parses to a valid
# molecule; `Chem.MolFromSmiles` returns None for an unparsable SMILES.
X = np.array([
    AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    for mol in map(Chem.MolFromSmiles, data.processed_smiles.values)
])

# Dimensionality reduction

## Principal component analysis (PCA)

In [None]:
# Hand the fingerprint matrix `X` to the PCA transformer; the reduction itself
# is requested in the next cell via `.reduce(...)`.
pca_reducer = PCATransform(X)

In [None]:
# Reduce the fingerprints to two PCA components, then preview the first five
# entries of the result as the cell output.
pca_embeddings = pca_reducer.reduce(
    n_components=2,
)
pca_embeddings[:5]

## UMAP

In [None]:
# Hand the same fingerprint matrix `X` to the UMAP transformer; the embedding
# is computed in the next cell via `.reduce(...)`.
umap_reducer = UMAPTransform(X)

In [None]:
# Compute the UMAP embedding with the neighbourhood size, minimum distance and
# distance metric spelled out, then preview the first five entries.
umap_embeddings = umap_reducer.reduce(
    n_neighbors=10,
    min_dist=0.25,
    metric='euclidean',
)
umap_embeddings[:5]

## K-means clustering with 10 clusters

In [None]:
# Cluster the fingerprint matrix with k-means (k = 10) and show the labels
# returned for the first five entries.
clustering_kmeans = KMeansClustering(X)
labels = clustering_kmeans.cluster(
    n_clusters=10,
)
labels[:5]

## Using the elbow method to select the optimal number of clusters

In [None]:
# Run the elbow method over candidate cluster counts k = 2, 3, ..., 19
# (`np.arange` excludes the upper bound 20), reusing the k-means object
# created above.
clustering_kmeans.elbow_method(n_clusters=np.arange(2, 20))

## Butina clustering with similarity threshold > 0.7

In [None]:
# Butina clustering is fed the raw `processed_smiles` values rather than the
# fingerprint matrix `X`. Despite the name, `mol_list` holds the column's
# values as loaded from the CSV, not RDKit molecule objects.
mol_list = data["processed_smiles"].values

In [None]:
# Cluster with Butina using a similarity cutoff of 0.7 and show the labels
# returned for the first five entries. Note that `labels` is rebound here and
# no longer refers to the k-means result.
clustering_butina = ButinaClustering(mol_list)
labels = clustering_butina.cluster(
    sim_cutoff=0.7,
)
labels[:5]

## HDBSCAN clustering

In [None]:
# Cluster the fingerprint matrix with HDBSCAN. `labels` is rebound once more
# and now holds the HDBSCAN assignment inspected in the next cell.
clustering_hdbscan = HDBSCANClustering(X)
labels = clustering_hdbscan.cluster(
    min_cluster_size=5,
    min_samples=1,
    metric='euclidean',
)

In [None]:
# Show the first five distinct label values; `np.unique` returns them sorted.
distinct_labels = np.unique(labels)
distinct_labels[:5]

In [None]:
#| hide
# Run nbdev's export step from inside the notebook. The import is kept local
# to this hidden cell because nothing else in the notebook uses nbdev.
from nbdev import nbdev_export
nbdev_export()