In [1]:
#imports
import pandas as pd
import numpy as np
from collections import defaultdict
#plotting tools
import seaborn as sn
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

#stats module
from scipy import stats

#MOL2VEC related imports
import re, gc
from gensim.models import Word2Vec 
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

#rdkit
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

#ML models and splits from sklearn
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

## Processing the main dataset (3000 rows, ATC codes dataset).
This notebook generates RDKit features for the dataset produced by the pipeline described in the paper.

In [15]:
#helper functions for data generation
# Replacement dictionary for the 3 SMILES strings flagged as incorrect in the
# dataset. Each key is the SMILES exactly as it appears in the 'smi' column;
# each value is the SMILES that is parsed in its place (see smi2mol).
wrong_ones_dict = {
    # Cisplatin (Pt with two chloride and two ammine ligands).
    # The value previously stored for this key was the SMILES of an unrelated,
    # platinum-free fluorinated organic molecule, so every row carrying this
    # key received descriptors for the wrong compound. It now maps to a
    # fragment-style SMILES of the same complex, in line with the two entries
    # below.
    '[H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H])[H]' : 'N.N.Cl[Pt]Cl',
    # Carboplatin
    '[H][N]([H])([H])[Pt]1(OC(=O)C2(CCC2)C(=O)O1)[N]([H])([H])[H]': 'C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2]',
    # Oxaliplatin
    '[H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[Pt]11OC(=O)C(=O)O1' : 'C1CCC(C(C1)[NH-])[NH-].C(=O)(C(=O)[O-])[O-].[Pt+4]'
}
def smi2mol(x):
    """
    Build an RDKit molecule from a SMILES string.

    If `x` is a key of `wrong_ones_dict`, its replacement SMILES is parsed
    instead; any other string is parsed as given. The return value is
    whatever `Chem.MolFromSmiles` produces for the chosen SMILES.
    """
    smiles_to_parse = wrong_ones_dict.get(x, x)
    return Chem.MolFromSmiles(smiles_to_parse)

def get_gast(m):
    """
    Return the Gasteiger charge of the first atom of an RDKit molecule.

    Gasteiger charges are computed on `m` (this stores a '_GasteigerCharge'
    property on its atoms), then the value held by the atom at index 0 is
    read back and converted to a float. Only that one atom's charge is
    returned, not a per-molecule aggregate.
    """
    AllChem.ComputeGasteigerCharges(m)
    first_atom = m.GetAtomWithIdx(0)
    raw_charge = first_atom.GetProp('_GasteigerCharge')
    return float(raw_charge)

In [33]:
# Generate RDKit descriptor columns for every sheet of the input workbook and
# write the augmented sheets to 'dataset_rdkit.xlsx'.

#Read the generated dataset
# The handle is not closed here so that the `xls` name stays usable after this
# cell has run, as it was before.
xls = pd.ExcelFile('AACT_final_v4.xlsx', engine = 'openpyxl')

# A list of sheets to be processed
# NOTE(review): the loop below does not consult this list; it walks every sheet
# in the workbook (the captured output also shows the p2fail_* sheets). The
# name is kept in case other cells reference it.
sheets_to_fill = ["fail_common_targets", "fail_common_ki", "fail_common_stdval", "passb1_commontargets", "passb1_common_ki", "passb1_common_stdval", "passb2_commontargets", "passb2_common_ki", "passb2_common_stdval"]

# Output column name -> function computing that descriptor from an RDKit Mol.
# Insertion order is the order in which the columns are appended to each sheet.
rdkit_feature_funcs = {
    'mol_w': lambda mol: float(Descriptors.ExactMolWt(mol)),
    'num_valence_electrons': lambda mol: float(Descriptors.NumValenceElectrons(mol)),
    'num_heteroatoms': lambda mol: float(Descriptors.NumHeteroatoms(mol)),
    'gastiger_charges': get_gast,
    'tpsa': Descriptors.TPSA,
    'h_acceptors': Descriptors.NumHAcceptors,
    'h_donors': Descriptors.NumHDonors,
    'n_rotatable_bonds': Descriptors.NumRotatableBonds,
    'n_rings': Descriptors.rdMolDescriptors.CalcNumRings,
    # Total atom count includes hydrogens, so they are made explicit first.
    'num_of_atoms': lambda mol: Chem.AddHs(mol).GetNumAtoms(),
    # Adding hydrogens cannot change the heavy-atom count, so none are added.
    'num_of_heavy_atoms': lambda mol: mol.GetNumHeavyAtoms(),
}

#create an excel writer
# The context manager writes and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter('dataset_rdkit.xlsx', engine='openpyxl') as writer:
    for sheet in xls.sheet_names:
        df = pd.read_excel(xls, sheet)
        # .copy() makes the filtered frame independent of the frame it was
        # sliced from, so the column assignments below are unambiguous.
        df = df[pd.notnull(df['smi'])].copy()
        print(sheet, len(df))
        df['mol'] = df['smi'].apply(smi2mol)

        # smi2mol yields None for SMILES RDKit cannot parse; fail here with the
        # offending strings rather than inside the first descriptor call.
        unparsed = df.loc[df['mol'].isna(), 'smi']
        if len(unparsed) > 0:
            raise ValueError(
                "Sheet {!r}: RDKit could not parse {} SMILES string(s): {}".format(
                    sheet, len(unparsed), unparsed.unique().tolist()
                )
            )

        for column, func in rdkit_feature_funcs.items():
            df[column] = df['mol'].apply(func)

        # The Mol objects were only needed to compute the descriptors.
        df = df.drop(columns=['mol'])
        df = df.replace([np.inf, -np.inf], np.nan)

        # Move the label column to the front of the sheet.
        col = df.pop("label")
        df.insert(0, col.name, col)
        df.to_excel(writer, sheet_name=sheet, index = False)

fail_common_targets 443
fail_common_ki 443
fail_common_stdval 443
passb1_commontargets 2264
passb1_common_ki 2264
passb1_common_stdval 2264
passb2_commontargets 573




passb2_common_ki 573




passb2_common_stdval 573




p2fail_commontargets 368
p2fail_common_ki 368
p2fail_common_stdval 368
