# Prediction of the human fetal-maternal blood concentration ratio of chemicals

## Import data

In [1]:
import pandas as pd

In [2]:
# Read the two input tables; the first becomes train_df, the second test_df.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("table-1.csv", "table-2.csv")
)

In [3]:
# Preview the first rows of the table loaded from table-1.csv.
train_df.head()

Unnamed: 0,Name,AATSC1c,ZMIC1,Observed logFM,Predicted logFM,AD,PubChem CID
0,Oxychlordane,0.9,1.9,-1.02,-0.8,Y,33772
1,DDE,1.06,1.1,-0.98,-0.64,Y,3035
2,Mifepristone,1.1,-0.03,-0.96,-0.41,N,4196
3,Atazanavir,-0.36,1.94,-0.89,-0.7,Y,148192
4,Nonachlor,1.54,2.54,-0.84,-1.0,Y,19520


In [4]:
# Preview the first rows of the table loaded from table-2.csv.
test_df.head()

Unnamed: 0,Name,AATSC1c,ZMIC1,Observed logFM,Predicted logFM,AD,PubChem CID
0,Indinavir,0.44,1.54,-1.1,-0.68,Y,3706
1,Duloxetine,1.13,0.04,-0.92,-0.43,N,60835
2,17-Hydroxyprogesterone caproate,0.9,0.71,-0.7,-0.55,Y,3653
3,Nelfinavir,0.34,0.82,-0.6,-0.52,Y,4451
4,Bupivacaine,0.91,-0.38,-0.52,-0.31,Y,2474


In [5]:
# Stack both tables row-wise, then keep only the identifier columns and the
# observed / predicted logFM values.
df = pd.concat([train_df, test_df])[
    ['Name', 'PubChem CID', 'Observed logFM', 'Predicted logFM']
]

In [6]:
# Preview the combined table after column selection.
df.head()

Unnamed: 0,Name,PubChem CID,Observed logFM,Predicted logFM
0,Oxychlordane,33772,-1.02,-0.8
1,DDE,3035,-0.98,-0.64
2,Mifepristone,4196,-0.96,-0.41
3,Atazanavir,148192,-0.89,-0.7
4,Nonachlor,19520,-0.84,-1.0


In [7]:
# The SMILES file is tab-separated and has no header row, so the two columns
# are named explicitly.
smiles = pd.read_csv(
    "task1-smiles.txt",
    sep="\t",
    header=None,
    names=['PubChem CID', 'SMILES'],
)

In [8]:
# Preview the CID -> SMILES lookup table.
smiles.head()

Unnamed: 0,PubChem CID,SMILES
0,3653,CCCCCC(=O)OC1(CCC2C1(CCC3C2CCC4=CC(=O)CCC34C)C...
1,1971,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)C4CC(C=C4)CO
2,1978,CCCC(=O)NC1=CC(=C(C=C1)OCC(CNC(C)C)O)C(=O)C
3,2476,CC(C)(C)C(C)(C1CC23CCC1(C4C25CCN(C3CC6=C5C(=C(...
4,10917,C[N+](C)(C)C[C@@H](CC(=O)[O-])O


In [9]:
# Attach a SMILES string to every compound by PubChem CID.
# A left join keeps exactly the rows of `df`, in their original order:
#   - CIDs that appear only in the SMILES file do not create rows without
#     Name / logFM values (an outer join would add them);
#   - the row order does not depend on pandas' key sorting for outer joins.
# Compounds with no SMILES entry end up with NaN in 'SMILES', which the
# null check below will surface.
df = pd.merge(df, smiles, on='PubChem CID', how='left')

### Check if there are any null values

In [10]:
# Count missing values per column; a non-zero count after the merge would mean
# some rows lack a value in that column.
df.isnull().sum()

Name               0
PubChem CID        0
Observed logFM     0
Predicted logFM    0
SMILES             0
dtype: int64

No column contains missing values, so no imputation is needed.

In [11]:
# Preview the merged table, now including the SMILES column.
df.head()

Unnamed: 0,Name,PubChem CID,Observed logFM,Predicted logFM,SMILES
0,Oxychlordane,33772,-1.02,-0.8,C12C(C(C3(C1O3)Cl)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl...
1,DDE,3035,-0.98,-0.64,C1=CC(=CC=C1C(=C(Cl)Cl)C2=CC=C(C=C2)Cl)Cl
2,Mifepristone,4196,-0.96,-0.41,CC#CC1(CCC2C1(CC(C3=C4CCC(=O)C=C4CCC23)C5=CC=C...
3,Atazanavir,148192,-0.89,-0.7,CC(C)(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)[C@H]...
4,Nonachlor,19520,-0.84,-1.0,C12C(C(C(C1Cl)Cl)Cl)C3(C(=C(C2(C3(Cl)Cl)Cl)Cl)...


In [12]:
# Save the merged table. The index is written too (to_csv default), so the
# file's first column is the unnamed row index.
df.to_csv('data.csv')

## Calculate 1D/2D Descriptors

In [13]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [14]:
# Names of every descriptor registered in Descriptors.descList
# (each entry is a (name, function) pair; only the name is needed here).
descLists = [name for name, _ in Descriptors.descList]
desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(descLists)

# Parse each SMILES once, then evaluate all descriptors per molecule.
mols = list(map(Chem.MolFromSmiles, df['SMILES']))
data = list(map(desc_calc.CalcDescriptors, mols))

# Index the descriptor table like `df` so the column-wise concat lines up.
names = df.index
df_desc = pd.DataFrame(data, index=names, columns=descLists)
df_1d2d = pd.concat([df, df_desc], axis=1)

In [15]:
# Preview the table with the 1D/2D descriptor columns appended.
df_1d2d.head()

Unnamed: 0,Name,PubChem CID,Observed logFM,Predicted logFM,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,Oxychlordane,33772,-1.02,-0.8,C12C(C(C3(C1O3)Cl)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl...,6.653356,-1.579444,6.653356,0.14287,0.393598,...,0,0,0,0,0,0,0,0,0,0
1,DDE,3035,-0.98,-0.64,C1=CC(=CC=C1C(=C(Cl)Cl)C2=CC=C(C=C2)Cl)Cl,5.971502,0.208005,5.971502,0.208005,0.619326,...,0,0,0,0,0,0,0,0,0,0
2,Mifepristone,4196,-0.96,-0.41,CC#CC1(CCC2C1(CC(C3=C4CCC(=O)C=C4CCC23)C5=CC=C...,12.169555,-0.913604,12.169555,0.220906,0.639526,...,0,0,0,0,0,0,0,0,0,0
3,Atazanavir,148192,-0.89,-0.7,CC(C)(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)[C@H]...,13.797319,-1.219772,13.797319,0.119827,0.154312,...,0,0,0,0,0,0,0,0,0,0
4,Nonachlor,19520,-0.84,-1.0,C12C(C(C(C1Cl)Cl)Cl)C3(C(=C(C2(C3(Cl)Cl)Cl)Cl)...,6.625579,-1.607222,6.625579,0.122562,0.408518,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Save the 1D/2D descriptor table (row index included, per the to_csv default).
df_1d2d.to_csv('1d2d_desc.csv')

## Calculate 3D Descriptors using ETKDGv2

In [17]:
from rdkit.Chem import AllChem, Descriptors3D

In [18]:
def smiles_to_conformation(smiles, random_seed=42):
    """Build a hydrogen-complete molecule with one ETKDGv2 conformer.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.
    random_seed : int, optional
        Seed handed to the ETKDGv2 embedder so that the generated coordinates
        (and anything computed from them) are reproducible between runs.
        Pass -1 to get RDKit's non-deterministic seeding.

    Returns
    -------
    rdkit.Chem.Mol
        The molecule with explicit hydrogens. It carries one embedded
        conformer unless embedding failed, in which case a warning is issued
        and the molecule is returned without a conformer.

    Raises
    ------
    ValueError
        If ``smiles`` cannot be parsed by RDKit.
    """
    import warnings

    comp = Chem.MolFromSmiles(smiles)
    if comp is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside AddHs.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    comp_h = AllChem.AddHs(comp)

    params = AllChem.ETKDGv2()
    params.randomSeed = random_seed

    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
    if AllChem.EmbedMolecule(comp_h, params) == -1:
        # Starting from random coordinates often rescues difficult molecules.
        params.useRandomCoords = True
        if AllChem.EmbedMolecule(comp_h, params) == -1:
            warnings.warn(
                f"3D embedding failed for SMILES {smiles!r}; "
                "the returned molecule has no conformer."
            )
    return comp_h

In [19]:
# Shape descriptors to compute; each name is a function in
# rdkit.Chem.Descriptors3D that is called on a molecule carrying a conformer.
descLists = ['Asphericity','Eccentricity','InertialShapeFactor','NPR1','NPR2','PMI1','PMI2','PMI3','RadiusOfGyration','SpherocityIndex']

# One embedded molecule per row of `df`.
mols = [smiles_to_conformation(smiles) for smiles in df['SMILES']]

# Build every descriptor column in a single frame instead of concatenating
# inside a loop, and give it df's index so rows are matched by label even if
# `df` does not have a default 0..n-1 index.
df_desc = pd.DataFrame(
    {
        desc: [getattr(Descriptors3D, desc)(mol) for mol in mols]
        for desc in descLists
    },
    columns=descLists,
    index=df.index,
)
df_3d = pd.concat([df, df_desc], axis=1)

In [20]:
# Preview the table with the 3D shape descriptor columns appended.
df_3d.head()

Unnamed: 0,Name,PubChem CID,Observed logFM,Predicted logFM,SMILES,Asphericity,Eccentricity,InertialShapeFactor,NPR1,NPR2,PMI1,PMI2,PMI3,RadiusOfGyration,SpherocityIndex
0,Oxychlordane,33772,-1.02,-0.8,C12C(C(C3(C1O3)Cl)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl...,0.124035,0.849923,0.000542,0.526907,0.839296,1547.953805,2465.693952,2937.813664,2.863918,0.652447
1,DDE,3035,-0.98,-0.64,C1=CC(=CC=C1C(=C(Cl)Cl)C2=CC=C(C=C2)Cl)Cl,0.358764,0.953404,0.000564,0.301695,0.738898,1310.388026,3209.340032,4343.414771,3.732887,0.245965
2,Mifepristone,4196,-0.96,-0.41,CC#CC1(CCC2C1(CC(C3=C4CCC(=O)C=C4CCC23)C5=CC=C...,0.18307,0.872762,0.000197,0.488146,0.640115,3254.022601,4267.066604,6666.088659,4.063487,0.256209
3,Atazanavir,148192,-0.89,-0.7,CC(C)(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)[C@H]...,0.210077,0.900806,7.6e-05,0.434221,0.704534,9264.157011,15031.319416,21335.124767,5.689301,0.259332
4,Nonachlor,19520,-0.84,-1.0,C12C(C(C(C1Cl)Cl)Cl)C3(C(=C(C2(C3(Cl)Cl)Cl)Cl)...,0.049676,0.733965,0.000402,0.679187,0.820061,2042.082998,2465.639941,3006.655892,2.908232,0.733779


In [21]:
# Save the 3D descriptor table (row index included, per the to_csv default).
df_3d.to_csv('3d_desc.csv')

## Calculate ECFP4 Fingerprint

In [22]:
from rdkit import DataStructs
import numpy as np

# Morgan (ECFP-like) bits depend only on the molecular graph, so no 3D
# conformer is generated here: this avoids a slow, stochastic embedding step
# that has no effect on the fingerprint.
# NOTE(review): hydrogens are kept explicit so the bits match what this cell
# produced before; conventional ECFP4 is computed on the heavy-atom graph --
# confirm which variant is intended.
mols = [Chem.AddHs(Chem.MolFromSmiles(smi)) for smi in df['SMILES']]


def _bitvect_to_row(bitvect):
    """Convert an RDKit bit vector into a 1-D integer numpy array of 0/1."""
    row = np.zeros((0,), dtype=int)
    # ConvertToNumpyArray resizes `row` to the vector length and fills it.
    DataStructs.ConvertToNumpyArray(bitvect, row)
    return row


# radius=2 corresponds to ECFP4 (diameter 4); 2048 folded bits per molecule.
fp = np.vstack([
    _bitvect_to_row(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048))
    for mol in mols
])

# Index the bit table like `df` so the column-wise concat matches rows by label.
df_fp = pd.concat([df, pd.DataFrame(fp, index=df.index)], axis=1)
df_fp.head()

Unnamed: 0,Name,PubChem CID,Observed logFM,Predicted logFM,SMILES,0,1,2,3,4,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Oxychlordane,33772,-1.02,-0.8,C12C(C(C3(C1O3)Cl)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DDE,3035,-0.98,-0.64,C1=CC(=CC=C1C(=C(Cl)Cl)C2=CC=C(C=C2)Cl)Cl,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Mifepristone,4196,-0.96,-0.41,CC#CC1(CCC2C1(CC(C3=C4CCC(=O)C=C4CCC23)C5=CC=C...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Atazanavir,148192,-0.89,-0.7,CC(C)(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)[C@H]...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Nonachlor,19520,-0.84,-1.0,C12C(C(C(C1Cl)Cl)Cl)C3(C(=C(C2(C3(Cl)Cl)Cl)Cl)...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Save the fingerprint table (row index included, per the to_csv default).
df_fp.to_csv('fp.csv')