In [2]:
import os
import sys
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
# Configure pandas display limits (columns, rows, line width)
for _option, _limit in (
    ('display.max_columns', 500),
    ('display.max_rows', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

In [3]:
# Read the cleaned dataset from disk into a DataFrame
df = pd.read_csv(
    './data/Dopamine_D2_rdkit.csv',
    low_memory=False,
)


## Generate molecular descriptors with RDKit

In [4]:
# Function to turn SMILES to molecules and calculate descriptors
def mol_descriptors(molecules, verbose=False):
    """Compute a table of RDKit descriptors for a collection of SMILES strings.

    Parameters
    ----------
    molecules : iterable of str
        SMILES strings (for example a pandas Series). The input is only
        iterated over; it is not modified.
    verbose : bool, default False
        When True, print how many entries could not be converted to a molecule.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with a fresh 0..n-1 index and
        the float columns 'Molecular Weight', 'Number of Hydrogen Donors',
        'Number of Hydrogen Acceptors', 'Lipophilicity' and 'Fraction C sp3'.
        Entries that are not strings, or that RDKit cannot parse, produce a row
        of NaN so the output stays row-aligned with the input. An empty input
        yields an empty frame that still carries the five columns.
    """
    column_names = ['Molecular Weight', 'Number of Hydrogen Donors', 'Number of Hydrogen Acceptors', 'Lipophilicity', 'Fraction C sp3']

    rows = []
    n_failed = 0
    for smiles in molecules:
        # MolFromSmiles returns None for unparseable SMILES; non-string entries
        # (e.g. NaN from a CSV) are treated the same way instead of crashing.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            n_failed += 1
            rows.append([np.nan] * len(column_names))
            continue

        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.MolLogP(mol),
            rdMolDescriptors.CalcFractionCSP3(mol),
        ])

    if verbose and n_failed:
        print(f"mol_descriptors: {n_failed} of {len(rows)} entries could not be parsed; their rows are NaN")

    # Reshape so that zero or one molecule still gives a 2-D (n, 5) array;
    # dtype=float keeps every column float64, as the stacked array did before.
    data = np.array(rows, dtype=float).reshape(-1, len(column_names))
    return pd.DataFrame(data, columns=column_names)

In [5]:
# Compute the descriptor table from the dataset's SMILES column
df_descriptors = mol_descriptors(molecules=df['canonical_smiles'])

In [6]:
# Show the descriptor table (the cell's last expression is rendered as its output)
df_descriptors

Unnamed: 0,Molecular Weight,Number of Hydrogen Donors,Number of Hydrogen Acceptors,Lipophilicity,Fraction C sp3
0,342.446,0.0,4.0,3.37000,0.272727
1,360.461,0.0,5.0,3.47440,0.272727
2,352.460,2.0,5.0,-0.35120,0.812500
3,352.460,2.0,5.0,-0.35120,0.812500
4,352.460,2.0,5.0,-0.35120,0.812500
...,...,...,...,...,...
10947,454.014,1.0,4.0,4.60240,0.346154
10948,418.541,3.0,4.0,3.73432,0.360000
10949,426.908,2.0,6.0,-0.63066,0.238095
10950,432.518,2.0,7.0,2.49260,0.529412


TODO: Decide whether entries whose pchembl_values are NaN should be removed, and add Morgan fingerprints:
AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) # Morgan FP


Then perform exploratory data analysis (EDA) on the dataset?