In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns 
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
from sklearn.gaussian_process import GaussianProcessRegressor
from tanimoto import FastTanimotoKernel

In [None]:
class ChemCalculator(FastTanimotoKernel):
    """Gaussian-process property model over Morgan fingerprints of SMILES.

    Intended call order:
    ``select_data`` -> ``matrix_fingerprints`` -> (optional)
    ``most_diverse_data`` -> ``gp_train`` -> ``gp_predict``.

    ``self.fingerprint`` and ``self.property`` are kept index-aligned: entry
    ``i`` of one belongs to entry ``i`` of the other.
    """

    def __init__(self, data_set):
        """Load the CSV at ``data_set`` and initialise all pipeline state.

        Args:
            data_set: Path (or anything ``pandas.read_csv`` accepts) of the
                CSV holding the SMILES and property columns.
        """
        super().__init__()
        self.data_set = data_set
        # Keep the untouched table so select_data can be re-run with any size.
        self._full_set = pd.read_csv(self.data_set)
        self.read_set = self._full_set
        # Every attribute read by a later method is defined up front so that
        # calling methods out of order hits the explicit checks below instead
        # of an AttributeError.
        self.data_smiles = []
        self.data_property = []
        self.fingerprint = []
        self.property = []
        self.gp = None
        self.y_pred = None
        self.y_std = None

    def show_data(self):
        """Print the currently selected table (the full CSV before selection)."""
        print(self.read_set)

    def select_data(self, smiles, property, number_of_data):
        """Select the first ``number_of_data`` rows as the working pool.

        Args:
            smiles: Name of the column holding SMILES strings.
            property: Name of the column holding the target property.
            number_of_data: Number of leading rows to keep (pool size).

        Returns:
            Tuple ``(smiles_list, property_list)`` for the selected rows.
        """
        # Slice from the full table, not from a previous selection, so the
        # method is idempotent and a larger pool can be requested later.
        self.read_set = self._full_set.iloc[:number_of_data]  # poolsize
        self.data_smiles = self.read_set[smiles].tolist()
        self.data_property = self.read_set[property].tolist()
        return self.data_smiles, self.data_property

    def convert_smiles_to_fingerprint(self, data_smiles, radius=2, nBits=2048):
        """Convert one SMILES string to a Morgan bit-vector fingerprint.

        Args:
            data_smiles: A single SMILES string.
            radius: Morgan fingerprint radius.
            nBits: Length of the bit vector.

        Returns:
            The fingerprint bit vector (also stored in ``self.fp``).

        Raises:
            ValueError: If RDKit cannot parse ``data_smiles``.
        """
        self.molecul = Chem.MolFromSmiles(data_smiles)
        if self.molecul is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(f"Could not parse SMILES: {data_smiles!r}")
        self.fp = AllChem.GetMorganFingerprintAsBitVect(self.molecul, radius=radius, nBits=nBits)
        return self.fp

    def matrix_fingerprints(self):
        """Build fingerprints for every selected SMILES.

        Fills ``self.fingerprint`` and resets ``self.property`` to the full
        selected property list so the two stay aligned. Prints a hint and
        leaves both empty when no data has been selected.
        """
        self.fingerprint = []
        self.property = []
        if self.data_smiles:
            self.fingerprint = [self.convert_smiles_to_fingerprint(smile) for smile in self.data_smiles]
            self.property = list(self.data_property)
        else:
            print("No data SMILES selected. Please use select_data method first.")

    def distance_tanimoto(self, i, j, fps=None):
        """Return the Tanimoto distance (1 - similarity) between two fingerprints.

        Args:
            i: Index of the first fingerprint.
            j: Index of the second fingerprint.
            fps: Fingerprint sequence to index; defaults to ``self.fingerprint``.
        """
        if fps is None:
            fps = self.fingerprint
        return 1.0 - DataStructs.TanimotoSimilarity(fps[i], fps[j])

    def most_diverse_data(self, subset_size=9000, seed=-1):
        """Reduce the fingerprints to a MaxMin-diverse subset.

        Args:
            subset_size: Number of items to pick; clamped to the number of
                available fingerprints.
            seed: Seed forwarded to the picker; ``-1`` is the picker's own
                default. Pass a fixed value for reproducible picks.

        Returns:
            The picked fingerprints, or ``None`` (after printing a hint) when
            no fingerprints are available.

        Raises:
            ValueError: If ``self.property`` is not aligned with
                ``self.fingerprint``.
        """
        if not self.fingerprint:
            print("No fingerprints. Please verify and use matrix_fingerprints().")
            return None

        self.n_total = len(self.fingerprint)
        if len(self.property) != self.n_total:
            raise ValueError(
                "Property values are not aligned with the fingerprints. "
                "Please re-run matrix_fingerprints()."
            )
        # The picker cannot return more items than the pool contains.
        self.subset_size = min(subset_size, self.n_total)
        self.ids = list(
            MaxMinPicker().LazyPick(self.distance_tanimoto, self.n_total, self.subset_size, seed=seed)
        )

        # The ids index the *current* fingerprint list, so the properties must
        # be taken from the list aligned with it (self.property). Using the
        # full pool list would mislabel molecules on a repeated call.
        self.property = [self.property[idx] for idx in self.ids]
        self.fingerprint = [self.fingerprint[idx] for idx in self.ids]
        return self.fingerprint

    def get_fingerprints(self):
        """Return the current fingerprint list, printing a hint when it is empty."""
        if not self.fingerprint:
            print("Fingerprint matrix is empty. Please use matrix_fingerprints method first.")
        return self.fingerprint

    def gp_train(self, random_state=None):
        """Fit a Gaussian-process regressor on the current fingerprints.

        Args:
            random_state: Forwarded to ``GaussianProcessRegressor``; set it to
                make the optimizer restarts reproducible.

        Returns:
            ``None``. Prints a hint and does nothing when fingerprints or
            property values are missing.
        """
        if not self.fingerprint or not self.property:
            print("Fingerprint matrix or property data is missing. Please ensure both are available.")
            return None
        self.kernel = FastTanimotoKernel()
        self.gp = GaussianProcessRegressor(
            kernel=self.kernel,
            alpha=1e-6,
            n_restarts_optimizer=10,
            random_state=random_state,
        )
        self.gp.fit(self.fingerprint, self.property)

    def gp_predict(self, test_smiles):
        """Predict the property and its standard deviation for new SMILES.

        Results are stored in ``self.y_pred`` / ``self.y_std`` and printed.

        Args:
            test_smiles: Iterable of SMILES strings.

        Raises:
            RuntimeError: If called before ``gp_train`` has fitted a model.
            ValueError: If any SMILES cannot be parsed.
        """
        if self.gp is None:
            raise RuntimeError("Model is not trained. Please use gp_train method first.")
        self.test_fp = [self.convert_smiles_to_fingerprint(smile) for smile in test_smiles]
        self.X_test = np.array(self.test_fp, dtype=object)
        self.y_pred, self.y_std = self.gp.predict(self.X_test, return_std=True)
        print("Predicted values:", self.y_pred)
        print("Uncertainity:",self.y_std)

    def get_predictions(self):
        """Return the latest predictions (``None`` until ``gp_predict`` has run)."""
        return self.y_pred

    def get_uncertainty(self):
        """Return the latest predictive std (``None`` until ``gp_predict`` has run)."""
        return self.y_std

In [None]:
# Build the calculator from the QM9 table and print the loaded data.
propCal = ChemCalculator(data_set="qm9.csv")
propCal.show_data()

In [None]:
# Pool: first 25000 rows, SMILES column "smiles", target column "gap".
propCal.select_data("smiles", "gap", 25000)
# Fingerprint the pool, keep a diverse subset, then fit the GP on it.
propCal.matrix_fingerprints()
propCal.most_diverse_data()
propCal.gp_train()

In [None]:
# Test model
# Evaluate on molecules from outside the training pool. select_data truncates
# propCal.read_set to the pool, so its length marks where the pool ends. Rows
# inside the pool may have been picked for training, so testing on them would
# leak training data into the evaluation.
data_test = pd.read_csv("qm9.csv")
pool_size = len(propCal.read_set)
n_test = 1000
smiles_test = data_test["smiles"].iloc[pool_size:pool_size + n_test]
assert not smiles_test.empty, "No rows left after the training pool to test on."
propCal.gp_predict(smiles_test)