In [7]:
import numpy as np
import pandas as pd
from psmiles.psmiles import PolymerSmiles
from rdkit import Chem
from importlib import util


# Seed NumPy's global random number generator so that any later code relying
# on np.random produces the same values on every run of the notebook.
random_seed = 123
np.random.seed(random_seed)

In [2]:
class PSmiles(PolymerSmiles):
    """PolymerSmiles extended with a MiniLM sentence-embedding fingerprint."""

    # Shared, lazily populated SentenceTransformer instance. Loading the model
    # is expensive, so it is created on first use and reused by every later
    # fingerprint_miniLM access instead of being rebuilt per call.
    _miniLM_model = None

    @classmethod
    def _get_miniLM_model(cls):
        """Return the cached "all-MiniLM-L6-v2" model, loading it on first use."""
        if cls._miniLM_model is None:
            # Imported here so the class can be defined without the optional
            # sentence-transformers dependency installed.
            from sentence_transformers import SentenceTransformer

            cls._miniLM_model = SentenceTransformer("all-MiniLM-L6-v2")
        return cls._miniLM_model

    @property
    def fingerprint_miniLM(self) -> np.ndarray:
        """Compute the miniLM fingerprint.

        Encodes the ``psmiles`` string of ``self.canonicalize`` with the
        "all-MiniLM-L6-v2" sentence-transformers model.

        Returns:
            np.ndarray: The embedding produced by the model for this polymer
            string (the first and only row of the encoded batch).

        Raises:
            AssertionError: If the ``sentence_transformers`` package cannot be
                found. Note that assertions are skipped when Python runs with
                ``-O``; in that case a missing package surfaces as an
                ImportError from the import itself.
        """
        assert util.find_spec("sentence_transformers"), (
            "MiniLM fingerprints require the 'sentence-transformers' Python package."
            " Please install with "
            "`pip install 'psmiles[polyBERT]@git+https://github.com/"
            "Ramprasad-Group/psmiles.git'` "
            "Or "
            "`poetry add git+https://github.com/"
            "Ramprasad-Group/psmiles.git -E polyBERT` "
        )

        model = self._get_miniLM_model()

        # encode() takes a batch; a one-element list is passed and the single
        # resulting row is returned.
        return model.encode([self.canonicalize.psmiles], show_progress_bar=False)[0]
    

In [3]:

def _fingerprint_as_array(smiles, attribute):
    """Read one fingerprint attribute from a PSmiles object as a NumPy array.

    Args:
        smiles: Polymer SMILES value passed unchanged to the ``PSmiles``
            constructor.
        attribute: Name of the ``PSmiles`` fingerprint attribute to read,
            e.g. ``"fingerprint_circular"``.

    Returns:
        np.ndarray: The fingerprint as an array. If its first entry is a
        string, every entry is parsed with ``float`` and a numeric array is
        returned instead.
    """
    fingerprint = np.array(getattr(PSmiles(smiles), attribute))
    # Some fingerprints may arrive as strings; convert those to floats so the
    # stored arrays are numeric.
    if fingerprint.size > 0 and isinstance(fingerprint[0], str):
        fingerprint = np.array([float(x) for x in fingerprint])
    return fingerprint


def generate_fingerprint_miniLM(smiles):
    """Return the ``fingerprint_miniLM`` of ``smiles`` as a NumPy array."""
    return _fingerprint_as_array(smiles, "fingerprint_miniLM")


def generate_fingerprint_circular(smiles):
    """Return the ``fingerprint_circular`` of ``smiles`` as a NumPy array.

    ``fingerprint_circular`` is presumably inherited from PolymerSmiles.
    """
    return _fingerprint_as_array(smiles, "fingerprint_circular")


def generate_fingerprint_polyBERT(smiles):
    """Return the ``fingerprint_polyBERT`` of ``smiles`` as a NumPy array.

    ``fingerprint_polyBERT`` is presumably inherited from PolymerSmiles.
    """
    return _fingerprint_as_array(smiles, "fingerprint_polyBERT")

In [4]:
# Load the raw polymer table and preview its first rows to see which columns
# and property labels are available.
df = pd.read_csv('polymers.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,smiles,property,value
0,0,[*]CC([*])C,Eat,-5.14
1,1,[*]CC([*])F,Eat,-5.18
2,2,[*]CC([*])(F)F,Eat,-5.21
3,3,[*]C(F)C([*])(F)F,Eat,-5.11
4,4,[*]CCC(F)(F)C([*])(F)F,Eat,-5.21


In [5]:
# Build the working table for the 'Egc' property. The CSV is re-read here so
# this cell gives the same result no matter how often it is re-run.
df = pd.read_csv('polymers.csv')
desired_columns = ['smiles','value']
# Select rows and columns in a single .loc call and rename without inplace=True.
# The explicit .copy() makes df an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
df = (
    df.loc[df['property'] == 'Egc', desired_columns]
    .rename(columns={'value': 'Egc'})
    .copy()
)
df.head()


Unnamed: 0,smiles,Egc
822,[*]C[*],6.8972
823,[*]CC([*])C,6.5196
824,[*]CC([*])CC,6.517
825,[*]CC([*])CCC,6.7336
826,[*]CC([*])CC(C)C,6.7394


In [6]:
# Compute every fingerprint for each SMILES string. assign() returns a new
# frame with the columns added in the order given, which avoids writing into a
# frame that may have been derived from a slice (SettingWithCopyWarning).
df = df.assign(
    fingerprint_circular=df["smiles"].apply(generate_fingerprint_circular),
    fingerprint_polyBERT=df["smiles"].apply(generate_fingerprint_polyBERT),
    fingerprint_miniLM=df["smiles"].apply(generate_fingerprint_miniLM),
)
# NOTE: despite the ".pth" extension this file is a pandas pickle; load it
# with pd.read_pickle("updated_polymers.pth").
df.to_pickle("updated_polymers.pth")
