In [2]:
import numpy as np
import pandas as pd
from psmiles.psmiles import PolymerSmiles
from rdkit import Chem
from importlib import util
from rdkit.Chem import rdFingerprintGenerator

# Seed NumPy's legacy global RNG once, up front, so that any step relying on
# `np.random` gives the same result after a kernel restart.
random_seed = 123
np.random.seed(seed=random_seed)

In [None]:


class Smiles(PolymerSmiles):
    """`PolymerSmiles` subclass that fingerprints a plain SMILES string.

    Adds three fingerprint properties on top of the parent class:

    * ``fingerprint_miniLM``   - sentence-transformer embedding from
      ``all-MiniLM-L6-v2``
    * ``fingerprint_polyBERT`` - sentence-transformer embedding from
      ``kuelumbus/polyBERT``
    * ``fingerprint_circular`` - RDKit Morgan count fingerprint

    Args:
        smiles: SMILES string of the molecule.
        deactivate_warnings: Forwarded unchanged to ``PolymerSmiles.__init__``.
    """

    # Loaded sentence-transformer models, keyed by model name. Constructing a
    # model is expensive, and these properties are typically evaluated once
    # per DataFrame row, so each model is built once and shared by every
    # instance instead of being rebuilt on every property access.
    _sentence_models: dict = {}

    def __init__(self, smiles: str, deactivate_warnings: bool = True):
        self.smiles = smiles
        super().__init__(smiles, deactivate_warnings)

    def can_molecule(self):
        """Return the RDKit canonical SMILES of this molecule.

        Returns:
            str: canonical SMILES produced by ``Chem.MolToSmiles``.

        Raises:
            ValueError: if RDKit cannot parse the stored string.
        """
        mol = Chem.MolFromSmiles(self.psmiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of deeper inside RDKit.
            raise ValueError(f"RDKit could not parse SMILES: {self.psmiles!r}")
        return Chem.MolToSmiles(mol)

    @classmethod
    def _sentence_model(cls, model_name: str):
        """Return the cached model for ``model_name``, loading it on first use."""
        if model_name not in cls._sentence_models:
            # Imported lazily: sentence-transformers is an optional dependency.
            from sentence_transformers import SentenceTransformer

            cls._sentence_models[model_name] = SentenceTransformer(model_name)
        return cls._sentence_models[model_name]

    def _sentence_embedding(self, model_name: str, label: str) -> np.ndarray:
        """Embed the canonical SMILES with the given sentence-transformer model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            label: Fingerprint name used in the missing-dependency message.

        Returns:
            numpy.ndarray: embedding of the single canonical SMILES string.
        """
        assert util.find_spec("sentence_transformers"), (
            f"{label} fingerprints require the 'sentence-transformers' Python package."
            " Please install with "
            "`pip install 'psmiles[polyBERT]@git+https://github.com/"
            "Ramprasad-Group/psmiles.git'` "
            "Or "
            "`poetry add git+https://github.com/"
            "Ramprasad-Group/psmiles.git -E polyBERT` "
        )

        can_smiles = self.can_molecule()
        model = self._sentence_model(model_name)
        # encode() takes a batch; we pass one string and unwrap the one result.
        return model.encode([can_smiles], show_progress_bar=False)[0]

    @property
    def fingerprint_miniLM(self) -> np.ndarray:
        """Embedding of the canonical SMILES from ``all-MiniLM-L6-v2``.

        Returns:
            numpy.ndarray: MiniLM embedding
        """
        return self._sentence_embedding("all-MiniLM-L6-v2", "MiniLM")

    @property
    def fingerprint_polyBERT(self) -> np.ndarray:
        """Embedding of the canonical SMILES from ``kuelumbus/polyBERT``.

        Returns:
            numpy.ndarray: polyBERT embedding
        """
        # This previously loaded "all-MiniLM-L6-v2", which made the result
        # identical to `fingerprint_miniLM`.
        return self._sentence_embedding("kuelumbus/polyBERT", "PolyBERT")

    @property
    def fingerprint_circular(self) -> np.ndarray:
        """Compute the circular (Morgan) count fingerprint

        Uses an RDKit Morgan generator built with its default parameters.

        Returns:
            numpy.ndarray: circular fingerprint

        Raises:
            ValueError: if RDKit cannot parse the stored string.
        """
        mol = Chem.MolFromSmiles(self.smiles)
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {self.smiles!r}")

        fp_gen = rdFingerprintGenerator.GetMorganGenerator()
        return fp_gen.GetCountFingerprintAsNumPy(mol).astype(int)

In [None]:


def generate_fingerprint(smiles):
    """Return the default fingerprint of ``smiles`` as a NumPy array.

    Args:
        smiles: SMILES string to fingerprint.

    Returns:
        numpy.ndarray: the fingerprint; string entries are converted to float.
    """
    sm = Smiles(smiles)
    raw = sm.fingerprint
    # NOTE(review): `fingerprint` is inherited, not defined on `Smiles`. In
    # upstream psmiles it appears to be a method taking an optional kind
    # rather than a property - confirm against the installed version. Wrapping
    # a bound method in np.array gives a 0-d object array that cannot be
    # indexed, so call it first. If it is a property this is a no-op.
    if callable(raw):
        raw = raw()
    fingerprint = np.array(raw)
    if fingerprint.size > 0 and isinstance(fingerprint[0], str):
        fingerprint = np.array([float(x) for x in fingerprint])
    return fingerprint

In [None]:

def _as_numeric_fingerprint(raw) -> np.ndarray:
    """Wrap a raw fingerprint in a NumPy array, converting strings to floats.

    The conversion is applied only when the first element is a ``str``; any
    other input is returned as ``np.array(raw)`` unchanged.
    """
    fingerprint = np.array(raw)
    if fingerprint.size > 0 and isinstance(fingerprint[0], str):
        fingerprint = np.array([float(x) for x in fingerprint])
    return fingerprint


def generate_fingerprint_miniLM(smiles):
    """Return ``Smiles(smiles).fingerprint_miniLM`` as a NumPy array.

    Args:
        smiles: SMILES string to fingerprint.

    Returns:
        numpy.ndarray: the MiniLM fingerprint.
    """
    return _as_numeric_fingerprint(Smiles(smiles).fingerprint_miniLM)


def generate_fingerprint_circular(smiles):
    """Return ``Smiles(smiles).fingerprint_circular`` as a NumPy array.

    Args:
        smiles: SMILES string to fingerprint.

    Returns:
        numpy.ndarray: the circular fingerprint.
    """
    return _as_numeric_fingerprint(Smiles(smiles).fingerprint_circular)


def generate_fingerprint_polyBERT(smiles):
    """Return ``Smiles(smiles).fingerprint_polyBERT`` as a NumPy array.

    Args:
        smiles: SMILES string to fingerprint.

    Returns:
        numpy.ndarray: the polyBERT fingerprint.
    """
    return _as_numeric_fingerprint(Smiles(smiles).fingerprint_polyBERT)

In [None]:
# Build the fingerprint table: keep the SMILES and gap columns of qm9.csv,
# expose the gap as "Egc", then add one fingerprint column per featurizer.
desired_columns = ["smiles", "gap"]
df = (
    pd.read_csv("qm9.csv")[desired_columns]
    .rename(columns={'gap': 'Egc'})
    .assign(
        fingerprint_circular=lambda frame: frame["smiles"].apply(
            generate_fingerprint_circular
        ),
        fingerprint_polyBERT=lambda frame: frame["smiles"].apply(
            generate_fingerprint_polyBERT
        ),
        fingerprint_miniLM=lambda frame: frame["smiles"].apply(
            generate_fingerprint_miniLM
        ),
    )
)
# The output is a pandas pickle, even though the file name ends in ".pth".
df.to_pickle("updated_molecules.pth")