In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns 
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from sklearn.gaussian_process import GaussianProcessRegressor
from tanimoto import FastTanimotoKernel

In [2]:
class ChemCalculator(FastTanimotoKernel):
    """Fit and query a Gaussian-process regressor on Morgan bit-vector fingerprints.

    Typical call order::

        calc = ChemCalculator("data.csv")
        calc.select_data(smiles="smiles", property="gap", number_of_data=1000)
        calc.matrix_fingerprints()
        calc.gp_train()
        calc.gp_predict(list_of_smiles)

    Every attribute that a later method inspects is created in ``__init__`` so
    that calling the methods out of order yields the intended guidance message
    (or a clear exception) instead of an ``AttributeError``.
    """

    def __init__(self, data_set):
        """Read the CSV at ``data_set`` into ``self.read_set``.

        Parameters
        ----------
        data_set : str or path-like
            Location of the CSV file, forwarded to ``pd.read_csv``.
        """
        super().__init__()
        self.data_set = data_set
        self.read_set = pd.read_csv(self.data_set)

        # State filled in by the later pipeline steps; the empty / None
        # defaults are what the guard clauses below test against.
        self.data_smiles = []
        self.data_property = []
        self.fingerprint = []
        self.gp = None
        self.y_pred = None
        self.y_std = None

    def show_data(self):
        """Print the full table that was loaded from ``data_set``."""
        print(self.read_set)

    def select_data(self, smiles, property, number_of_data):
        """Pick the first ``number_of_data`` rows of two columns as training data.

        Parameters
        ----------
        smiles : str
            Name of the column holding SMILES strings.
        property : str
            Name of the column holding the regression target.
        number_of_data : int
            Number of leading rows to keep.

        Returns
        -------
        tuple[list, list]
            ``(data_smiles, data_property)`` as plain Python lists.
        """
        # Slice into a local frame: ``self.read_set`` stays intact, so the
        # method can be called again with a different (even larger) row count.
        subset = self.read_set.iloc[:number_of_data]

        self.data_smiles = subset[smiles].tolist()
        self.data_property = subset[property].tolist()
        return self.data_smiles, self.data_property

    def convert_smiles_to_fingerprint(self, data_smiles, radius=2, nBits=2048):
        """Convert one SMILES string into a Morgan fingerprint bit vector.

        Parameters
        ----------
        data_smiles : str
            A single SMILES string.
        radius : int, default 2
            Forwarded to ``AllChem.GetMorganFingerprintAsBitVect``.
        nBits : int, default 2048
            Forwarded to ``AllChem.GetMorganFingerprintAsBitVect``.

        Returns
        -------
        The bit vector returned by ``AllChem.GetMorganFingerprintAsBitVect``.

        Raises
        ------
        ValueError
            If RDKit cannot parse ``data_smiles``.
        """
        molecule = Chem.MolFromSmiles(data_smiles)
        if molecule is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of deep inside RDKit.
            raise ValueError(f"Could not parse SMILES string: {data_smiles!r}")

        # The last molecule / fingerprint remain available as attributes, as before.
        self.molecul = molecule
        self.fp = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=radius, nBits=nBits)
        return self.fp

    def matrix_fingerprints(self):
        """Compute fingerprints for every selected SMILES into ``self.fingerprint``."""
        self.fingerprint = []
        if self.data_smiles:
            self.fingerprint = [self.convert_smiles_to_fingerprint(smile) for smile in self.data_smiles]
        else:
            print("No data SMILES selected. Please use select_data method first.")

    def get_fingerprints(self):
        """Return the list of computed fingerprints (empty if none were computed)."""
        if not self.fingerprint:
            print("Fingerprint matrix is empty. Please use matrix_fingerprints method first.")
        return self.fingerprint

    def gp_train(self):
        """Fit a ``GaussianProcessRegressor`` with a ``FastTanimotoKernel``.

        Prints a message and returns ``None`` without fitting when either the
        fingerprints or the property values are missing.

        Raises
        ------
        ValueError
            If the number of fingerprints and property values differ.
        """
        if not self.fingerprint or not self.data_property:
            print("Fingerprint matrix or property data is missing. Please ensure both are available.")
            return None
        if len(self.fingerprint) != len(self.data_property):
            # Happens when select_data was re-run with a different row count
            # without recomputing the fingerprints.
            raise ValueError(
                "Fingerprint count and property count differ; "
                "re-run matrix_fingerprints after select_data."
            )
        self.kernel = FastTanimotoKernel()
        self.gp = GaussianProcessRegressor(kernel=self.kernel, alpha=1e-6, normalize_y=True)
        self.gp.fit(self.fingerprint, self.data_property)

    def gp_predict(self, test_smiles):
        """Predict the target and its standard deviation for ``test_smiles``.

        Results are printed and stored in ``self.y_pred`` / ``self.y_std``.

        Parameters
        ----------
        test_smiles : iterable of str
            SMILES strings to predict for.

        Raises
        ------
        RuntimeError
            If no model has been fitted yet.
        ValueError
            If any SMILES string cannot be parsed.
        """
        if self.gp is None:
            raise RuntimeError("No trained model available. Please use gp_train method first.")

        self.test_fp = [self.convert_smiles_to_fingerprint(smile) for smile in test_smiles]
        # NOTE(review): training passes a plain list while prediction passes an
        # object array; kept as-is because the input contract of
        # FastTanimotoKernel is not visible here — confirm both forms are accepted.
        self.X_test = np.array(self.test_fp, dtype=object)
        self.y_pred, self.y_std = self.gp.predict(self.X_test, return_std=True)
        print("Predicted values:", self.y_pred)
        print("Uncertainity:",self.y_std)

    def get_predictions(self):
        """Return the predictions of the last ``gp_predict`` call (``None`` before any)."""
        return self.y_pred

    def get_uncertainty(self):
        """Return the standard deviations of the last ``gp_predict`` call (``None`` before any)."""
        return self.y_std


In [3]:
# Build the calculator from the qm9.csv file and print the loaded table.
propCal = ChemCalculator(data_set="qm9.csv")
propCal.show_data()

            mol_id               smiles          A           B           C  \
0            gdb_1                    C  157.71180  157.709970  157.706990   
1            gdb_2                    N  293.60975  293.541110  191.393970   
2            gdb_3                    O  799.58812  437.903860  282.945450   
3            gdb_4                  C#C    0.00000   35.610036   35.610036   
4            gdb_5                  C#N    0.00000   44.593883   44.593883   
...            ...                  ...        ...         ...         ...   
133880  gdb_133881  C1C2C3C4C5OC14C5N23    3.59483    2.198990    1.904230   
133881  gdb_133882  C1N2C3C2C2C4OC12C34    3.65648    2.142370    1.904390   
133882  gdb_133883  C1N2C3C4C5C2C13CN45    3.67118    2.143140    1.895010   
133883  gdb_133884  C1N2C3C4C5CC13C2C45    3.52845    2.151310    1.865820   
133884  gdb_133885  C1N2C3C4C5OC13C2C45    3.64015    2.217640    1.937930   

            mu  alpha    homo    lumo     gap  ...      zpve   

In [None]:
# Use the first 15000 rows with the "gap" column as target,
# fingerprint their SMILES, then fit the Gaussian-process model.
propCal.select_data(
    smiles="smiles",
    property="gap",
    number_of_data=15000,
)
propCal.matrix_fingerprints()
propCal.gp_train()



In [5]:
# Test model on rows outside the training selection.
# The table is re-read from disk rather than taken from propCal.
# NOTE(review): training selects rows 0-14999 and this slice starts at
# position 15001, so row 15000 is in neither set — confirm this is intended.
data_test = pd.read_csv("qm9.csv")
smiles_test = data_test["smiles"].iloc[15001:17001]
propCal.gp_predict(smiles_test)

Predicted values: [0.3439556  0.32275659 0.34393548 ... 0.29433581 0.29644211 0.26013756]
Uncertainity: [0.45154912 0.28869305 0.3849954  ... 0.65169551 0.62895645 0.69224832]
