# Pulling Molecular Descriptors from CSV

In [9]:
# Load the precomputed descriptor table from CSV into a DataFrame,
# then preview its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer="final_df_descriptos.csv")
data.head(n=5)

Unnamed: 0,Standardised_Smiles,AM1_E,AM1_Eele,AM1_HF,AM1_HOMO,AM1_IP,AM1_LUMO,AM1_dipole,ASA,ASA+,...,vsurf_Wp3,vsurf_Wp4,vsurf_Wp5,vsurf_Wp6,vsurf_Wp7,vsurf_Wp8,weinerPath,weinerPol,zagreb,Target
0,Oc1c(Cl)cc(Cl)cc1Sc1cc(Cl)cc(Cl)c1O,-91061.594,-490900.38,-54.180599,-9.08072,9.08072,-0.86114,1.984315,493.96982,116.66142,...,59.125,3.75,0.75,0.0,0.0,0.0,688,30,98,0
1,CCCCCC1C(=O)CCC1CC(=O)O,-64010.746,-377504.44,-181.89195,-10.37975,10.37975,0.7611,1.84289,458.93463,320.75201,...,31.375,6.625,2.75,0.625,0.125,0.0,400,17,68,0
2,Cc1cc(=O)n(-c2ccccc2)n1C,-53151.84,-302296.66,64.051399,-8.76044,8.76044,-0.1718,4.432582,392.4848,232.39806,...,16.0,0.0,0.0,0.0,0.0,0.0,284,20,72,0
3,O=C(C=CC=Cc1ccc2c(c1)OCO2)N1CCCCC1,-83131.195,-527420.63,-51.748039,-8.66869,8.66869,-0.80507,2.895411,548.17822,370.79236,...,20.0,0.0,0.0,0.0,0.0,0.0,1131,27,106,0
4,O=c1ccc2cc(OC3OC(CO)C(O)C(O)C3O)c(O)cc2o1,-115882.23,-774507.5,-348.08813,-9.29644,9.29644,-1.18028,4.948146,522.75287,310.44724,...,94.625,18.375,4.75,0.625,0.0,0.0,1339,42,128,0


In [10]:
# Report the class balance: count rows whose "Target" equals 0 (inactive)
# and rows whose "Target" equals 1 (active) by summing the boolean masks.
print("Inactive Compounds:{}".format(int(data["Target"].eq(0).sum())))
print("Active Compounds:{}".format(int(data["Target"].eq(1).sum())))

Inactive Compounds:1126
Active Compounds:304


# Generating additional features (RDKit topological fingerprints, maxPath = 5)

In [16]:
# generating topological fingerprints for all chemical smiles in the dataframe
def get_tp_fps(df, c, max_path=5):
    """Compute an RDKit topological fingerprint for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one SMILES string per row.
    c : int
        Positional index of the column that contains the SMILES strings.
    max_path : int, optional
        Forwarded to ``Chem.RDKFingerprint`` as ``maxPath``. Defaults to 5,
        the value this function previously hard-coded.

    Returns
    -------
    list of list of int
        One list of fingerprint bit values per input row, in row order.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for a row, i.e. the SMILES
        could not be parsed. The message names the row position and SMILES.
    """
    topological = []

    # Walk the SMILES column once instead of doing a scalar .iloc lookup per row.
    for row, smiles in enumerate(df.iloc[:, c]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail with the offending row instead of an opaque error raised
            # from inside the fingerprint call.
            raise ValueError(
                "Could not parse SMILES at row {}: {!r}".format(row, smiles)
            )

        fp = Chem.RDKFingerprint(mol, maxPath=max_path)
        # Read the bit values directly. The inverse index of np.unique only
        # coincides with the bits when both 0 and 1 occur in the vector; an
        # all-ones vector would be returned as all zeros.
        topological.append(np.asarray(fp, dtype=int).tolist())

    return topological

In [12]:
# Build a DataFrame from the per-compound fingerprint lists returned by
# get_tp_fps: one row per compound, one column per list position.
# NOTE(review): the SMILES strings are read from column position 0 of `data`;
# the preview above lists an "Unnamed: 0" header first — confirm position 0 is
# the SMILES column and not a saved index column.
topological = pd.DataFrame(get_tp_fps(data, 0))

In [13]:
# Assemble one wide table, column-wise: every column of `data` except the
# last, then the fingerprint columns, then the last column of `data` so it
# stays at the far right.
data_all = pd.concat(
    objs=[data.iloc[:, :-1], topological, data.iloc[:, -1]],
    axis="columns",
)
display(data_all.head())

Unnamed: 0,Standardised_Smiles,AM1_E,AM1_Eele,AM1_HF,AM1_HOMO,AM1_IP,AM1_LUMO,AM1_dipole,ASA,ASA+,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Target
0,Oc1c(Cl)cc(Cl)cc1Sc1cc(Cl)cc(Cl)c1O,-91061.594,-490900.38,-54.180599,-9.08072,9.08072,-0.86114,1.984315,493.96982,116.66142,...,0,0,1,0,0,0,0,0,0,0
1,CCCCCC1C(=O)CCC1CC(=O)O,-64010.746,-377504.44,-181.89195,-10.37975,10.37975,0.7611,1.84289,458.93463,320.75201,...,0,0,0,0,0,0,0,0,0,0
2,Cc1cc(=O)n(-c2ccccc2)n1C,-53151.84,-302296.66,64.051399,-8.76044,8.76044,-0.1718,4.432582,392.4848,232.39806,...,0,0,0,0,0,0,1,0,0,0
3,O=C(C=CC=Cc1ccc2c(c1)OCO2)N1CCCCC1,-83131.195,-527420.63,-51.748039,-8.66869,8.66869,-0.80507,2.895411,548.17822,370.79236,...,0,0,0,0,0,0,0,0,1,0
4,O=c1ccc2cc(OC3OC(CO)C(O)C(O)C3O)c(O)cc2o1,-115882.23,-774507.5,-348.08813,-9.29644,9.29644,-1.18028,4.948146,522.75287,310.44724,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Split the combined table into model inputs and output:
# X = every column except the first and the last, y = the last column.
X, y = data_all.iloc[:, 1:-1], data_all.iloc[:, -1]