In [1]:
#importing all needed modules
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem

In [2]:
#loading the twosides csv, then discarding its "Unnamed: 0" column
#(presumably a row index that was saved along with the data - TODO confirm)
twosides_raw = pd.read_csv("twosides.csv")
twosides_df = twosides_raw.drop(columns = "Unnamed: 0")

In [3]:
#collecting each distinct Drug1 SMILES string once, in order of first appearance
unique_drug_1_smiles = twosides_df["Drug1"].unique()
drug_list_for_drug_1 = list(unique_drug_1_smiles)

In [4]:
#setting up the mordred descriptor calculator with the full descriptor set;
#ignore_3D=True is passed because the molecules are built from SMILES strings only
#(presumably this skips descriptors that need 3D coordinates - see mordred docs)
calc = Calculator(
    descriptors,
    ignore_3D=True,
)

In [5]:
#building drug features for drug1
#one RDKit molecule per unique Drug1 SMILES, kept in the same order as drug_list_for_drug_1
mols_for_drug_1 = [Chem.MolFromSmiles(item) for item in drug_list_for_drug_1]

#Chem.MolFromSmiles returns None (it does not raise) when a SMILES string cannot be parsed.
#Stop here with the offending strings instead of handing None entries to the calculator.
#Rows are not filtered silently because later cells rely on the features staying aligned
#with drug_list_for_drug_1.
invalid_smiles_for_drug_1 = [
    smiles
    for smiles, mol in zip(drug_list_for_drug_1, mols_for_drug_1)
    if mol is None
]
if invalid_smiles_for_drug_1:
    raise ValueError(
        f"{len(invalid_smiles_for_drug_1)} Drug1 SMILES could not be parsed by RDKit, "
        f"e.g. {invalid_smiles_for_drug_1[:10]}"
    )

#computing the mordred descriptors for every molecule (one row per molecule)
drug1_features = calc.pandas(mols_for_drug_1)

  9%|████████                                                                              | 58/616 [00:11<01:50,  5.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 13%|███████████▌                                                                          | 83/616 [00:15<01:36,  5.51it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 16%|█████████████▋                                                                        | 98/616 [00:16<01:14,  6.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 18%|███████████████▏                                                                     | 110/616 [00:18<01:11,  7.05it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|████████████████▌                                                                    | 120/616 [00:21<01:28,  5.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 54%|██████████████████████████████████████████████                                       | 334/616 [00:56<01:16,  3.67it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|█████████████████████████████████████████████████████████████████████████████████████| 616/616 [01:45<00:00,  5.81it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [6]:
#drug1_features has no column identifying the molecule, so attach the Drug1 SMILES strings
#this assumes the rows of drug1_features are in the same order as drug_list_for_drug_1
#(the molecules were built from that list in order - confirm calc.pandas keeps input order)
drug1_features["Drug1"] = drug_list_for_drug_1

In [7]:
#moving the Drug1 SMILES column to the front so it is easier to inspect:
#making it the index and then resetting the index puts it back as the first column
drug1_indexed_by_smiles = drug1_features.set_index("Drug1")
drug1_features = drug1_indexed_by_smiles.reset_index()

In [8]:
#building the list of columns to drop: every column with dtype "object", as only the
#numeric (float/int) descriptor columns are wanted
#"Drug1" is excluded by name because the SMILES strings are needed for merging; excluding it
#by name (rather than by skipping the first object column) keeps this correct even when
#"Drug1" is not the first object column or is not stored with dtype "object" at all
drug1_features_object_list = [
    column
    for column in drug1_features.select_dtypes("object").columns
    if column != "Drug1"
]

In [9]:
#removing every column named in drug1_features_object_list (the dtype "object" columns);
#the "Drug1" SMILES column is not in that list and is therefore kept
drug1_features = drug1_features.drop(drug1_features_object_list, axis = "columns")

In [10]:
#displaying the drug1 feature table built above: "Drug1" SMILES column first, followed by
#the descriptor columns that remain after the dtype "object" columns were dropped
drug1_features

Unnamed: 0,Drug1,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,18.864432,16.281312,1,1,6,6,42,24,0,...,7.412764,10.512519,74.507020,348.102351,8.288151,1365,39,132.0,161.0,5.097222
1,C(OC(C(F)(F)F)C(F)(F)F)F,8.608467,9.107960,0,0,0,0,15,12,0,...,0.000000,9.352100,42.729795,200.007212,13.333814,188,15,56.0,60.0,2.583333
2,CC(=O)NCC1CN(C(=O)O1)C2=CC(=C(C=C2)N3CCOCC3)F,18.660575,14.806816,0,0,6,6,44,24,0,...,6.803505,9.996659,72.324742,337.143784,7.662359,1467,35,124.0,144.0,5.277778
3,CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O,12.934771,10.892176,1,0,10,11,31,17,0,...,0.000000,9.700575,49.715135,230.094294,7.422397,530,26,86.0,100.0,3.833333
4,CC(=O)SC1CC2=CC(=O)CCC2(C3C1C4CCC5(C4(CC3)C)CC...,23.689037,19.021943,0,0,0,0,61,29,1,...,7.758333,10.867940,82.397607,416.202131,6.822986,1905,61,174.0,221.0,5.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,COC1C(C(C(C(O1)COS(=O)(=O)O)OC2C(C(C(C(O2)C(=O...,71.193407,55.333032,10,0,0,0,144,91,0,...,0.000000,11.559437,132.593611,1506.951333,10.464940,46698,154,490.0,564.0,18.527778
612,C1C2=C(C=CC(=C2Cl)Cl)NC3=NC(=O)CN31,12.894331,10.810899,0,3,6,6,23,16,0,...,6.803505,9.899329,63.147309,254.996617,11.086809,399,26,90.0,109.0,3.361111
613,CCCCN1CC(C(C(C1CO)O)O)O,10.773011,10.341551,0,1,0,0,36,15,0,...,0.000000,9.511999,46.699740,219.147058,6.087418,362,23,70.0,81.0,3.611111
614,[N]=O,0.000000,0.000000,0,0,0,0,2,2,0,...,0.000000,1.098612,7.493061,29.997989,14.998994,1,0,2.0,1.0,1.000000


In [11]:
#saving the drug1 feature table as csv
#the row index is written as well (index=True is the pandas default), so reading the file
#back with pd.read_csv yields an extra "Unnamed: 0" column
drug1_features.to_csv("cxe_feat_eng_drug1_droppedcolumns.csv", index = True)

In [12]:
#collecting each distinct Drug2 SMILES string once, in order of first appearance
unique_drug_2_smiles = twosides_df["Drug2"].unique()
drug_list_for_drug_2 = list(unique_drug_2_smiles)

In [13]:
#building drug features for drug2
#one RDKit molecule per unique Drug2 SMILES, kept in the same order as drug_list_for_drug_2
mols_for_drug_2 = [Chem.MolFromSmiles(item) for item in drug_list_for_drug_2]

#Chem.MolFromSmiles returns None (it does not raise) when a SMILES string cannot be parsed.
#Stop here with the offending strings instead of handing None entries to the calculator.
#Rows are not filtered silently because later cells rely on the features staying aligned
#with drug_list_for_drug_2.
invalid_smiles_for_drug_2 = [
    smiles
    for smiles, mol in zip(drug_list_for_drug_2, mols_for_drug_2)
    if mol is None
]
if invalid_smiles_for_drug_2:
    raise ValueError(
        f"{len(invalid_smiles_for_drug_2)} Drug2 SMILES could not be parsed by RDKit, "
        f"e.g. {invalid_smiles_for_drug_2[:10]}"
    )

#computing the mordred descriptors for every molecule (one row per molecule)
drug2_features = calc.pandas(mols_for_drug_2)

  5%|████▏                                                                                 | 31/634 [00:08<01:41,  5.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  9%|███████▊                                                                              | 58/634 [00:18<05:24,  1.77it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|███████████████████                                                                  | 142/634 [00:35<02:22,  3.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 24%|████████████████████▋                                                                | 154/634 [00:40<03:00,  2.66it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|██████████████████████████████████▉                                                  | 261/634 [01:01<01:43,  3.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 47%|████████████████████████████████████████▏                                            | 300/634 [01:11<01:26,  3.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 81%|████████████████████████████████████████████████████████████████████▊                | 513/634 [02:02<00:29,  4.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 92%|██████████████████████████████████████████████████████████████████████████████▍      | 585/634 [02:16<00:07,  6.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|█████████████████████████████████████████████████████████████████████████████████████| 634/634 [02:23<00:00,  4.41it/s]


In [14]:
#drug2_features has no column identifying the molecule, so attach the Drug2 SMILES strings
#this assumes the rows of drug2_features are in the same order as drug_list_for_drug_2
#(the molecules were built from that list in order - confirm calc.pandas keeps input order)
drug2_features["Drug2"] = drug_list_for_drug_2

In [15]:
#moving the Drug2 SMILES column to the front so it is easier to inspect:
#making it the index and then resetting the index puts it back as the first column
drug2_indexed_by_smiles = drug2_features.set_index("Drug2")
drug2_features = drug2_indexed_by_smiles.reset_index()

In [16]:
#building the list of columns to drop: every column with dtype "object", as only the
#numeric (float/int) descriptor columns are wanted
#"Drug2" is excluded by name because the SMILES strings are needed for merging; excluding it
#by name (rather than by skipping the first object column) keeps this correct even when
#"Drug2" is not the first object column or is not stored with dtype "object" at all
drug2_features_object_list = [
    column
    for column in drug2_features.select_dtypes("object").columns
    if column != "Drug2"
]

In [17]:
#removing every column named in drug2_features_object_list (the dtype "object" columns);
#the "Drug2" SMILES column is not in that list and is therefore kept
drug2_features = drug2_features.drop(drug2_features_object_list, axis = "columns")

In [18]:
#displaying the drug2 feature table built above: "Drug2" SMILES column first, followed by
#the descriptor columns that remain after the dtype "object" columns were dropped
drug2_features

Unnamed: 0,Drug2,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,19.079953,14.714219,0,1,12,12,53,25,0,...,0.000000,9.921082,59.262800,336.220164,6.343777,1680,36,124.0,142.0,5.750000
1,CN1C2=C(C(=O)N(C1=O)C)NC=N2.CN1C2=C(C(=O)N(C1=...,22.310147,21.858687,0,2,18,20,54,30,0,...,7.321850,10.408587,81.389222,420.198199,7.781448,27300000432,45,150.0,180.0,6.916667
2,CC1C2C(C(=O)N2C(=C1SC3CC(NC3)C(=O)NC4=CC=CC(=C...,25.955071,21.265074,2,1,6,6,58,33,0,...,7.773594,10.716926,86.310340,475.141321,8.192092,3464,54,180.0,219.0,7.083333
3,C1=NC(=NN1C2C(C(C(O2)CO)O)O)C(=O)N,12.963281,12.248253,0,0,5,5,29,17,0,...,7.515889,9.696279,65.758522,244.080769,8.416578,515,25,88.0,105.0,3.805556
4,CC1CC2C(CCC3(C2CCC3(C(=O)C)O)C)C4(C1=CC(=O)CC4)C,20.250812,16.665596,0,0,0,0,57,25,0,...,7.247793,10.791111,76.066825,344.235145,6.039213,1276,57,150.0,193.0,5.104167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629,C1=CN=C(C=N1)C(=O)N,6.542301,6.236096,0,0,6,6,14,9,0,...,0.000000,8.590258,37.289972,123.043262,8.788804,88,9,40.0,43.0,2.111111
630,C(C1C(C(C(C(O1)O)O)O)O)O,8.761080,8.651650,0,0,0,0,24,12,0,...,0.000000,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,2.777778
631,CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C,12.037754,11.142483,0,1,6,6,34,16,0,...,0.000000,9.470626,47.991812,239.107692,7.032579,454,20,78.0,85.0,3.430556
632,CNCCCN1C2=CC=CC=C2CCC3=CC=CC=C31,15.394589,12.442067,0,1,12,12,42,20,0,...,5.379897,9.832797,61.559435,266.178299,6.337579,759,34,102.0,120.0,4.611111


In [19]:
#saving the drug2 feature table as csv
#the row index is written as well (index=True is the pandas default), so reading the file
#back with pd.read_csv yields an extra "Unnamed: 0" column
drug2_features.to_csv("cxe_feat_eng_drug2_droppedcolumns.csv", index = True)