In [None]:
#@title Setup

!git clone https://github.com/molecule-generator-collection/DFT_param_opt.git

model_path = 'DFT_param_opt/model'
selected_index_mask = pickle.load(open(f'{model_path}/mordred_index_mask.pkl', 'rb'))

p_model = 'lightGBM' 
feature = 'All_Desc' 

!pip install lightgbm
!pip install rdkit
!pip install mordred

import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
from rdkit import rdBase, Chem, DataStructs
print(rdBase.rdkitVersion) 
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw, rdMolDescriptors
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Descriptors
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions
from rdkit.ML.Descriptors import MoleculeDescriptors


from mordred import Calculator, descriptors
import lightgbm as lgbm

def smi2descrptor(smi_list, mordred_index_list):
  """Build the concatenated feature matrix for a list of SMILES strings.

  For every molecule the following feature blocks are computed and joined
  column-wise, in this order: Morgan fingerprint (radius 2, 2048 bits),
  RDKit topological fingerprint, MACCS keys, every RDKit descriptor listed
  in ``Descriptors._descList``, and the Mordred descriptors (3D ignored)
  selected by ``mordred_index_list``.

  Args:
    smi_list: Iterable of SMILES strings.
    mordred_index_list: Column selector applied to the Mordred descriptor
      matrix (used as a NumPy index on axis 1).

  Returns:
    A 2-D NumPy array with one row per input SMILES.

  Raises:
    ValueError: If any SMILES string cannot be parsed by RDKit.
  """
  smi_list = list(smi_list)
  mol_list = [Chem.MolFromSmiles(smi) for smi in smi_list]

  # MolFromSmiles returns None for unparsable input; fail early with a clear
  # message instead of an opaque error from the fingerprint functions below.
  invalid_smiles = [smi for smi, mol in zip(smi_list, mol_list) if mol is None]
  if invalid_smiles:
    raise ValueError(f'Could not parse SMILES: {invalid_smiles}')

  Morgan_fp_list = np.array([AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048) for mol in mol_list])
  MACCS_fp_list = np.array([AllChem.GetMACCSKeysFingerprint(mol) for mol in mol_list])

  # Preparation so that every topological fingerprint has the same number of
  # bits: a target density of 0 is intended to keep the fingerprint from
  # being folded to a molecule-dependent size.
  params = FingerprintMols.FingerprinterDetails()
  params.tgtDensity = 0
  Topol_fp_list = np.array([FingerprintMols.FingerprintMol(mol, **params.__dict__) for mol in mol_list])

  # RDKit descriptors (208)
  descriptor_names = [descriptor_name[0] for descriptor_name in Descriptors._descList]
  descriptor_calculation = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
  RDkit_desc_list = np.array([descriptor_calculation.CalcDescriptors(mol_temp) for mol_temp in mol_list])

  # Mordred descriptor calculation
  df = pd.DataFrame({'MOL':mol_list})
  # Compute the descriptors
  calc = Calculator(descriptors, ignore_3D=True)
  df_descriptors_mordred = calc.pandas(df['MOL'])

  # Any cell whose string form contains a letter (e.g. Mordred error/missing
  # objects) is replaced by NaN so the table can be cast to float.
  # NOTE(review): this also blanks values such as 'True' or '1e-05'; kept
  # unchanged because the pretrained models presumably saw the same
  # preprocessing -- confirm before altering.
  df_descriptors = df_descriptors_mordred.astype(str)
  masks = df_descriptors.apply(lambda d: d.str.contains('[a-zA-Z]' ,na=False))
  df_descriptors = df_descriptors[~masks]
  df_descriptors = df_descriptors.astype(float)

  # Keep only the Mordred columns selected by the caller.
  df_descriptors = df_descriptors.values[:, mordred_index_list]

  All_Desc_list = np.concatenate([Morgan_fp_list, Topol_fp_list, MACCS_fp_list, RDkit_desc_list, df_descriptors], axis = 1)

  return All_Desc_list



def smiles2mu(smi_list):
  """Predict mu for each SMILES string with the five per-fold models.

  Reads the module-level ``selected_index_mask``, ``model_path``,
  ``p_model`` and ``feature`` to compute the features and locate the
  pickled models.

  Args:
    smi_list: Iterable of SMILES strings.

  Returns:
    A tuple ``(mean, std)``: the mean and the standard deviation of the
    fold predictions, both taken along axis 0 (across folds).
  """
  desc = smi2descrptor(smi_list, selected_index_mask)

  predicted_mu_list = []
  for fold in range(5):
    # Use a context manager so each model file is closed right after loading.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(f'{model_path}/model_{p_model}_{feature}_{fold}.pkl', 'rb') as model_file:
      loaded_model = pickle.load(model_file)
    predicted_mu_list.append(loaded_model.predict(desc))

  return np.mean(predicted_mu_list, axis = 0), np.std(predicted_mu_list, axis = 0)

#Prediction


Prepare a list of SMILES strings for the molecules whose optimal μ values you want to predict.

In [None]:
# Example input: two molecules given as SMILES strings.
smi_list = [
  'CC',
  'c1ccccc1',
]

The optimal μ values and their standard deviations (S.D.) are calculated by the smiles2mu function as follows.

In [None]:
# Run the prediction: the first result is the mean over the fold models,
# the second is the standard deviation over the fold models.
predicted_mu_list, predicted_mu_std_list = smiles2mu(smi_list=smi_list)

# Report both arrays, one per line.
print(f'Predicted mu list {predicted_mu_list}')
print(f'Predicted mu (std) list {predicted_mu_std_list}')

100%|██████████| 2/2 [00:00<00:00,  2.03it/s]


Predicted mu list [0.32931206 0.31471153]
Predicted mu (std) list [0.00597792 0.00986421]
