In [1]:
import pickle
import numpy as np
from rdkit import Chem
import rdkit.Chem.QED
import torch
from coati.models.io.coati import load_e3gnn_smiles_clip_e2e
from coati.models.regression.basic_due import basic_due
from coati.common.util import batch_indexable
import csv

# Device that is handed to the COATI model loader: the GPU when CUDA is
# available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

Device: cpu


## Load Model

The "grande_closed.pkl" checkpoint achieved the best overall performance on generation and SMILES-to-SMILES tasks.

In [2]:
# Load the "grande_closed" checkpoint and unpack the (encoder, tokenizer) pair
# returned by the loader; freeze=True and the selected DEVICE are passed through.
# NOTE(review): the recorded output of this cell is a RuntimeError on a CPU-only
# machine (CUDA storages in the pickle) — confirm the loader maps weights to DEVICE.
encoder, tokenizer = load_e3gnn_smiles_clip_e2e(
    doc_url="s3://terray-public/models/grande_closed.pkl",
    device=DEVICE,
    freeze=True,
)

Loading model from s3://terray-public/models/grande_closed.pkl
Downloading models/grande_closed.pkl from terray-public
File downloaded successfully to ./models/grande_closed.pkl


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

## Load ChEMBL data
We use a random subset of 10,000 molecules from the ChEMBL dataset, each given as a SMILES string.


In [None]:
import random

from coati.common.s3 import cache_read

# Load the ChEMBL canonical SMILES strings.
# NOTE(security): pickle.loads can execute arbitrary code contained in the
# payload — only read this file from a bucket you trust.
with cache_read("s3://terray-public/datasets/chembl_canonical_smiles.pkl", "rb") as f:
    chembl_canonical_smiles = pickle.loads(f.read(), encoding="UTF-8")

# For our example we use a small random subset of the data.
# A dedicated, seeded generator makes the subset identical on every
# "Restart & Run All", and sampling (instead of shuffling in place) leaves
# chembl_canonical_smiles untouched so re-running this cell is idempotent.
CHEMBL_SUBSET_SIZE = 10_000
CHEMBL_SEED = 42

# random.sample raises if asked for more items than exist, so clamp the size.
subset_size = min(CHEMBL_SUBSET_SIZE, len(chembl_canonical_smiles))
chembl_subset = [
    {"smiles": s, "source": "chembl_mols"}
    for s in random.Random(CHEMBL_SEED).sample(chembl_canonical_smiles, subset_size)
]

## Store Embeddings
SMILES strings are converted to embeddings using the model, and each SMILES string is written to a .csv file together with its embedding.

In [None]:
def smiles_to_embedding(
    records,
    encoder=encoder,
    tokenizer=tokenizer,
    batch_size=128,
    score=True,
    smiles_field="smiles",
    csv_file_path="smiles_embeddings.csv",
):
    """Embed the SMILES string of every record and write the results to a CSV file.

    Each record's SMILES is canonicalised with RDKit, tokenised as
    "[SMILES]<smiles>[STOP]" and encoded with ``encoder.encode_tokens``.
    One CSV row is written per successfully embedded molecule with the columns
    ``smiles`` (canonical SMILES), ``solubility`` (the record's "solubility"
    entry, or an empty string when absent) and ``embeddings`` (the flattened
    embedding as a comma-separated string of numbers).

    SMILES that cannot be parsed or tokenised are skipped individually, so one
    bad molecule no longer discards its whole batch. If encoding a batch fails,
    every SMILES of that batch is recorded as problematic. All problematic
    SMILES are written, one per line, to "problematic_smiles.txt".

    Args:
        records: Indexable collection of dict-like records.
        encoder: Model exposing ``device`` and ``encode_tokens(tokens, tokenizer)``.
        tokenizer: Tokenizer exposing ``tokenize_text(text, pad=True)``.
        batch_size: Number of records per encoder call.
        score: Unused; kept so existing callers keep working.
        smiles_field: Key under which each record stores its SMILES string.
        csv_file_path: Destination CSV path (overwritten on every call).

    Returns:
        None. Results are written to ``csv_file_path`` and "problematic_smiles.txt".
    """
    # tqdm was used without ever being imported; import it here and fall back
    # to a no-op wrapper so the function still runs when tqdm is not installed.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    print("Converting smiles to embeddings")
    problem_smiles = []  # SMILES that could not be parsed, tokenised or encoded

    # Ceiling division: the final, partially filled batch counts as a batch too.
    num_batches = -(-len(records) // batch_size)

    # The CSV is opened once for the whole run instead of once per batch.
    with open(csv_file_path, mode="w", newline="") as file, torch.no_grad():
        writer = csv.writer(file)
        writer.writerow(["smiles", "solubility", "embeddings"])  # CSV header

        for batch in tqdm(batch_indexable(records, batch_size), total=num_batches):
            valid_rows, valid_smiles, valid_tokens = [], [], []

            for row in batch:
                raw_smiles = row.get(smiles_field)
                try:
                    mol = Chem.MolFromSmiles(raw_smiles)
                    # RDKit signals an unparsable SMILES by returning None.
                    if mol is None:
                        raise ValueError("RDKit could not parse the SMILES string")
                    canonical = Chem.MolToSmiles(mol)
                    # A bare wildcard atom is embedded as methane, as before.
                    if canonical == "*":
                        text = "[SMILES]C[STOP]"
                    else:
                        text = "[SMILES]" + canonical + "[STOP]"
                    tokens = tokenizer.tokenize_text(text, pad=True)
                except Exception as Ex:
                    print(f"Error processing SMILES '{raw_smiles}': {Ex}")
                    problem_smiles.append(raw_smiles)
                    continue
                valid_rows.append(row)
                valid_smiles.append(canonical)
                valid_tokens.append(tokens)

            if not valid_rows:
                continue  # nothing in this batch survived validation

            try:
                batch_tokens = torch.tensor(
                    valid_tokens,
                    device=encoder.device,
                    dtype=torch.int,
                )
                batch_embeds = encoder.encode_tokens(batch_tokens, tokenizer)
                # Build every row before writing so a failure cannot leave a
                # half-written batch in the CSV.
                csv_rows = [
                    [
                        smi,
                        row.get("solubility", ""),
                        ",".join(map(str, batch_embeds[k].cpu().numpy().flatten())),
                    ]
                    for k, (row, smi) in enumerate(zip(valid_rows, valid_smiles))
                ]
            except Exception as Ex:
                print(f"Error embedding a batch of {len(valid_smiles)} SMILES: {Ex}")
                problem_smiles.extend(valid_smiles)
                continue

            writer.writerows(csv_rows)

    # Save the problematic SMILES for review.
    with open("problematic_smiles.txt", "w") as f:
        for smi in problem_smiles:
            f.write(f"{smi}\n")

In [None]:
# Embed the ChEMBL subset and write the SMILES/embedding pairs to the CSV file.
smiles_to_embedding(
    chembl_subset,
    encoder=encoder,
    tokenizer=tokenizer,
    batch_size=128,
    score=True,
    smiles_field="smiles",
    csv_file_path="smiles_embeddings.csv",
)

# Explanation Techniques
We use explanation techniques to check whether the model's embeddings capture the chemical properties of the molecules, i.e. whether similar molecules have similar embeddings.

## t-SNE

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Load the SMILES strings and embeddings from the CSV file.
df = pd.read_csv("smiles_embeddings.csv")

# The CSV header is lowercase ('smiles', 'solubility', 'embeddings'), so the
# columns must be addressed with exactly those names.
# Each embedding is stored as a comma-separated string of numbers without
# surrounding brackets; strip("[]") is a no-op for that format (and tolerates a
# bracketed variant) instead of blindly cutting off the first and last character.
embeddings = df["embeddings"].apply(
    lambda x: np.array(x.strip("[]").split(","), dtype=float)
)
smiles = df["smiles"]

# Stack the per-molecule vectors into one (n_molecules, embedding_dim) matrix.
embedding_matrix = np.vstack(embeddings.tolist())

# Perform t-SNE dimensionality reduction (fixed random_state for reproducibility).
tsne = TSNE(n_components=2, random_state=42)
embeddings_tsne = tsne.fit_transform(embedding_matrix)

# Create the t-SNE plot.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1])
ax.set_title("t-SNE Plot of Embeddings")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")
plt.show()