# Prediction of Placental Barrier Permeability

## Import data

In [1]:
import pandas as pd

In [2]:
# Read the raw table and discard the stray 'Unnamed: 0' column stored in the CSV.
df = pd.read_csv("./task2.csv").drop(columns='Unnamed: 0')

In [3]:
# Show the loaded table as the cell output.
df

Unnamed: 0,Compound,No.,SMILES,Name,CI-Obs.,CI-Cal.,Test
0,Abacavir,1,NC1=NC(NC2CC2)=C2N=CN([C@@H]3C[C@H](CO)C=C3)C2=N1,Abacavir,0.47,0.62,1
1,Acipimox,2,CC1=CN=C(C=[N+]1[O-])C(=O)O,Acipimox,0.25,0.38,0
2,Acyclovir,3,NC1=NC(=O)C2=C(N1)N(COCCO)C=N2,Acyclovir,0.17,0.09,1
3,Alanine,4,CC(N)C(O)=O,Alanine,0.30,0.40,1
4,Alfentanil,5,CCN1N=NN(CCN2CCC(COC)(CC2)N(C(=O)CC)C2=CC=CC=C...,Alfentanil,0.75,0.68,0
...,...,...,...,...,...,...,...
82,Trovafl oxacin,84,O=C(C1=CN(C2=CC=C(F)C=C2F)C3=C(C=C(F)C(N4CC5C(...,Trovafl oxacin,0.19,0.23,0
83,Urea,85,NC(N)=O,Urea,0.32,0.28,0
84,Valproic acid,86,CCCC(CCC)C(O)=O,Valproic_acid,0.95,0.93,0
85,Vinblastine,87,CC[C@]1(O)C[C@@H]2CN(C1)CCC1=C(NC3=CC=CC=C13)[...,Vinblastine,0.31,0.23,0


### Check if there are any null values

In [4]:
# Count the null entries in each column of df.
df.isnull().sum()

Compound    0
No.         0
SMILES      0
Name        0
CI-Obs.     0
CI-Cal.     0
Test        0
dtype: int64

No column contains missing values, so no imputation is needed.

In [5]:
# Persist the cleaned table next to the notebook.
# NOTE(review): no `index=` argument is passed, so pandas' default applies; if the
# row index is written it will come back as an 'Unnamed: 0' column on reload (like
# the one dropped after reading task2.csv) — confirm whether index=False is wanted.
df.to_csv('data.csv')

## Calculate 1D/2D Descriptors

In [6]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [7]:
# Row labels of the source table; reused so the descriptor rows line up with df.
names = df.index
# Parse every SMILES string into an RDKit molecule.
mols = list(map(Chem.MolFromSmiles, df['SMILES']))
# Names of all descriptors registered in Descriptors.descList (entries are (name, function) pairs).
descLists = [name for name, _func in Descriptors.descList]
desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(descLists)
# One tuple of descriptor values per molecule, in descLists order.
data = list(map(desc_calc.CalcDescriptors, mols))
df_desc = pd.DataFrame(data, index=names, columns=descLists)
# Append the descriptor columns to the right of the original columns.
df_1d2d = pd.concat([df, df_desc], axis=1)

In [8]:
# Show the table with the 1D/2D descriptor columns appended.
df_1d2d

Unnamed: 0,Compound,No.,SMILES,Name,CI-Obs.,CI-Cal.,Test,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,Abacavir,1,NC1=NC(NC2CC2)=C2N=CN([C@@H]3C[C@H](CO)C=C3)C2=N1,Abacavir,0.47,0.62,1,9.257136,0.162245,9.257136,...,0,0,0,0,0,0,0,0,0,0
1,Acipimox,2,CC1=CN=C(C=[N+]1[O-])C(=O)O,Acipimox,0.25,0.38,0,10.757593,-1.199444,10.757593,...,0,0,0,0,0,0,0,0,0,0
2,Acyclovir,3,NC1=NC(=O)C2=C(N1)N(COCCO)C=N2,Acyclovir,0.17,0.09,1,11.375082,-0.479483,11.375082,...,0,0,0,0,0,0,0,0,1,0
3,Alanine,4,CC(N)C(O)=O,Alanine,0.30,0.40,1,9.574074,-0.962963,9.574074,...,0,0,0,0,0,0,0,0,0,0
4,Alfentanil,5,CCN1N=NN(CCN2CCC(COC)(CC2)N(C(=O)CC)C2=CC=CC=C...,Alfentanil,0.75,0.68,0,12.953415,-0.373136,12.953415,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Trovafl oxacin,84,O=C(C1=CN(C2=CC=C(F)C=C2F)C3=C(C=C(F)C(N4CC5C(...,Trovafl oxacin,0.19,0.23,0,14.835016,-1.559007,14.835016,...,0,0,0,0,0,0,0,0,0,0
83,Urea,85,NC(N)=O,Urea,0.32,0.28,0,9.000000,-0.833333,9.000000,...,0,0,0,0,0,0,0,0,0,1
84,Valproic acid,86,CCCC(CCC)C(O)=O,Valproic_acid,0.95,0.93,0,10.479352,-0.635463,10.479352,...,0,0,0,0,0,0,0,0,0,0
85,Vinblastine,87,CC[C@]1(O)C[C@@H]2CN(C1)CCC1=C(NC3=CC=CC=C13)[...,Vinblastine,0.31,0.23,0,15.321994,-2.301098,15.321994,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Write the original columns plus the 1D/2D descriptors to disk.
df_1d2d.to_csv('1d2d_desc.csv')

## Calculate 3D Descriptors using ETKDGv2

In [10]:
from rdkit.Chem import AllChem, Descriptors3D

In [11]:
def smiles_to_conformation(smiles, random_seed=42):
    """Build a molecule with explicit hydrogens and one embedded 3D conformer.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.
    random_seed : int, optional
        Seed assigned to the ETKDGv2 embedding parameters so that the generated
        coordinates (and every 3D descriptor derived from them) are the same on
        each run. Pass -1 to fall back to RDKit's non-deterministic seeding.

    Returns
    -------
    rdkit.Chem.Mol
        The hydrogen-added molecule carrying the embedded conformer.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed, or if no conformer could be embedded.
    """
    comp = Chem.MolFromSmiles(smiles)
    if comp is None:
        # MolFromSmiles signals a parse failure by returning None, not by raising.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    comp_h = AllChem.AddHs(comp)

    params = AllChem.ETKDGv2()
    params.randomSeed = random_seed
    # EmbedMolecule returns the new conformer id, or -1 when embedding failed.
    if AllChem.EmbedMolecule(comp_h, params) == -1:
        # Retry from random starting coordinates before giving up.
        params.useRandomCoords = True
        if AllChem.EmbedMolecule(comp_h, params) == -1:
            raise ValueError(f"Could not embed a 3D conformer for SMILES: {smiles!r}")
    return comp_h

In [12]:
# Names of the Descriptors3D functions to evaluate; each becomes one output column.
descLists = ['Asphericity','Eccentricity','InertialShapeFactor','NPR1','NPR2','PMI1','PMI2','PMI3','RadiusOfGyration','SpherocityIndex']
# One hydrogen-added, 3D-embedded molecule per row of df.
mols = [smiles_to_conformation(smiles) for smiles in df['SMILES']]

# Collect every descriptor column first, then join them to df with a single
# concat instead of re-copying the growing frame once per descriptor.
desc_columns = {}
for desc in descLists:
    descCalc = getattr(Descriptors3D, desc)
    desc_columns[desc] = [descCalc(mol) for mol in mols]

# Index explicitly by df.index so rows are aligned by label, not by position.
df_desc = pd.DataFrame(desc_columns, index=df.index)
df_3d = pd.concat([df, df_desc], axis=1)

In [13]:
# Show the table with the 3D descriptor columns appended.
df_3d

Unnamed: 0,Compound,No.,SMILES,Name,CI-Obs.,CI-Cal.,Test,Asphericity,Eccentricity,InertialShapeFactor,NPR1,NPR2,PMI1,PMI2,PMI3,RadiusOfGyration,SpherocityIndex
0,Abacavir,1,NC1=NC(NC2CC2)=C2N=CN([C@@H]3C[C@H](CO)C=C3)C2=N1,Abacavir,0.47,0.62,1,0.459433,0.972642,0.001016,0.232309,0.816769,804.181949,2827.395608,3461.684604,3.519393,0.117582
1,Acipimox,2,CC1=CN=C(C=[N+]1[O-])C(=O)O,Acipimox,0.25,0.38,0,0.486427,0.975743,0.004011,0.218921,0.785309,195.792700,702.345288,894.354968,2.411444,0.044647
2,Acyclovir,3,NC1=NC(=O)C2=C(N1)N(COCCO)C=N2,Acyclovir,0.17,0.09,1,0.452947,0.971415,0.001497,0.237387,0.790807,528.335986,1760.045835,2225.632593,3.165735,0.097387
3,Alanine,4,CC(N)C(O)=O,Alanine,0.30,0.40,1,0.209424,0.897321,0.006688,0.441379,0.680275,101.712047,156.763556,230.441581,1.656450,0.426039
4,Alfentanil,5,CCN1N=NN(CCN2CCC(COC)(CC2)N(C(=O)CC)C2=CC=CC=C...,Alfentanil,0.75,0.68,0,0.620307,0.988818,0.000690,0.149130,0.943495,1366.631793,8646.199827,9164.016541,4.797915,0.195626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Trovafl oxacin,84,O=C(C1=CN(C2=CC=C(F)C=C2F)C3=C(C=C(F)C(N4CC5C(...,Trovafl oxacin,0.19,0.23,0,0.267773,0.922921,0.000254,0.384989,0.674704,2652.815286,4649.134882,6890.633865,4.128400,0.129791
83,Urea,85,NC(N)=O,Urea,0.32,0.28,0,0.216881,0.873886,0.012025,0.486132,0.567849,47.221287,55.159014,97.136848,1.288834,0.309194
84,Valproic acid,86,CCCC(CCC)C(O)=O,Valproic_acid,0.95,0.93,0,0.224514,0.909743,0.002160,0.415172,0.723711,335.102250,584.135966,807.139855,2.446523,0.291450
85,Vinblastine,87,CC[C@]1(O)C[C@@H]2CN(C1)CCC1=C(NC3=CC=CC=C13)[...,Vinblastine,0.31,0.23,0,0.389110,0.961271,0.000151,0.275605,0.876644,5807.881486,18473.711909,21073.224875,5.287973,0.264433


In [14]:
# Write the original columns plus the 3D descriptors to disk.
df_3d.to_csv('3d_desc.csv')

## Calculate ECFP4 Fingerprint

In [15]:
from rdkit import DataStructs
import numpy as np

# ECFP4 corresponds to a Morgan fingerprint of radius 2, folded here to 2048 bits.
FP_RADIUS = 2
FP_NBITS = 2048


def _morgan_bits(mol):
    """Return the Morgan bit vector of ``mol`` as a 1-D numpy array of 0/1 values."""
    bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=FP_RADIUS, nBits=FP_NBITS)
    bits = np.zeros((0,), dtype=np.int8)
    # ConvertToNumpyArray resizes `bits` to the vector length and fills it in place.
    DataStructs.ConvertToNumpyArray(bitvect, bits)
    return bits


# The Morgan fingerprint is computed from the molecular graph only, so no 3D
# conformer is needed: skip the slow, stochastic embedding and just add hydrogens.
# NOTE(review): explicit hydrogens are kept so the bits match the previous output,
# but ECFP4 is normally computed on the hydrogen-suppressed graph — confirm intent.
mols = [Chem.AddHs(Chem.MolFromSmiles(smiles)) for smiles in df['SMILES']]
fp = np.vstack([_morgan_bits(mol) for mol in mols])
# Index explicitly by df.index so the bit columns are aligned with df by label.
df_fp = pd.concat([df, pd.DataFrame(fp, index=df.index)], axis=1)
df_fp

Unnamed: 0,Compound,No.,SMILES,Name,CI-Obs.,CI-Cal.,Test,0,1,2,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Abacavir,1,NC1=NC(NC2CC2)=C2N=CN([C@@H]3C[C@H](CO)C=C3)C2=N1,Abacavir,0.47,0.62,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acipimox,2,CC1=CN=C(C=[N+]1[O-])C(=O)O,Acipimox,0.25,0.38,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acyclovir,3,NC1=NC(=O)C2=C(N1)N(COCCO)C=N2,Acyclovir,0.17,0.09,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Alanine,4,CC(N)C(O)=O,Alanine,0.30,0.40,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alfentanil,5,CCN1N=NN(CCN2CCC(COC)(CC2)N(C(=O)CC)C2=CC=CC=C...,Alfentanil,0.75,0.68,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Trovafl oxacin,84,O=C(C1=CN(C2=CC=C(F)C=C2F)C3=C(C=C(F)C(N4CC5C(...,Trovafl oxacin,0.19,0.23,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83,Urea,85,NC(N)=O,Urea,0.32,0.28,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84,Valproic acid,86,CCCC(CCC)C(O)=O,Valproic_acid,0.95,0.93,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85,Vinblastine,87,CC[C@]1(O)C[C@@H]2CN(C1)CCC1=C(NC3=CC=CC=C13)[...,Vinblastine,0.31,0.23,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Write the original columns plus the fingerprint bit columns to disk.
df_fp.to_csv('fp.csv')