In [33]:
import pandas as pd
import rdkit

In [34]:
cancer_name = 'breast_cancer'
assays_id = 639
assays_csv = f'../data/raw/{cancer_name}/{assays_id}/{assays_id}.csv'
assays_sdf = f'../data/raw/{cancer_name}/{assays_id}/{assays_id}.sdf'

In [35]:
df_labels = pd.read_csv(assays_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
df_labels = df_labels.drop([0, 1, 2])
df_labels.head(10)

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,% of Activity,F665,F620,FRET Signal,Comments
3,1,4257611.0,800354.0,Inactive,98.0,,,98.04,400,8898,450,no comment
4,2,4261287.0,2948508.0,Inactive,100.0,,,100.07,347,7565,459,no comment
5,3,4256064.0,690911.0,Inactive,75.0,,,74.64,170,4917,346,Possible Fluorescence Artifact
6,4,4262821.0,1293242.0,Inactive,86.0,,,86.34,323,8113,398,no comment
7,5,4262380.0,2972431.0,Inactive,93.0,,,93.09,365,8535,428,no comment
8,6,4255054.0,976214.0,Inactive,79.0,,,79.36,338,9212,367,Possible Fluorescence Artifact
9,7,4256335.0,977640.0,Inactive,101.0,,,100.52,363,7873,461,no comment
10,8,4257172.0,2198533.0,Inactive,93.0,,,93.32,397,9250,429,Possible Fluorescence Artifact
11,9,4263339.0,2914925.0,Inactive,106.0,,,106.14,423,8708,486,no comment
12,10,4259448.0,976855.0,Inactive,97.0,,,96.69,397,8935,444,Possible Fluorescence Artifact


In [37]:
df_labels = df_labels[['PUBCHEM_SID', 'PUBCHEM_ACTIVITY_OUTCOME']]

In [38]:
df_labels.head(10)

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_OUTCOME
3,4257611.0,Inactive
4,4261287.0,Inactive
5,4256064.0,Inactive
6,4262821.0,Inactive
7,4262380.0,Inactive
8,4255054.0,Inactive
9,4256335.0,Inactive
10,4257172.0,Inactive
11,4263339.0,Inactive
12,4259448.0,Inactive


In [39]:
df_labels['ACTIVITY'] = (df_labels['PUBCHEM_ACTIVITY_OUTCOME'] =='Active').astype(float)

In [40]:
df_labels

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_OUTCOME,ACTIVITY
3,4257611.0,Inactive,0.0
4,4261287.0,Inactive,0.0
5,4256064.0,Inactive,0.0
6,4262821.0,Inactive,0.0
7,4262380.0,Inactive,0.0
...,...,...,...
86104,14743395.0,Inactive,0.0
86105,14735582.0,Inactive,0.0
86106,14738043.0,Inactive,0.0
86107,14735632.0,Inactive,0.0


In [41]:
df_labels.query('ACTIVITY == True')

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_OUTCOME,ACTIVITY
39,3714184.0,Active,1.0
48,4261874.0,Active,1.0
80,4245703.0,Active,1.0
96,862513.0,Active,1.0
102,3717174.0,Active,1.0
...,...,...,...
85855,14730230.0,Active,1.0
85864,14742851.0,Active,1.0
85928,14733070.0,Active,1.0
85930,14741429.0,Active,1.0


In [42]:
from rdkit import Chem
from rdkit.Chem import Descriptors

In [53]:
mols = Chem.SDMolSupplier(assays_sdf)
df_features = pd.DataFrame()
for mol in mols:
    if mol == None:
        break
    mol_features={}
    mol_features['PUBCHEM_SID']=mol.GetPropsAsDict()['PUBCHEM_SUBSTANCE_ID']
    mol_features['min_abs_partial_charge'] = Descriptors.MinAbsPartialCharge(mol)
    mol_features['tpsa'] = Descriptors.TPSA(mol) 
    mol_features['exact_mol_wt'] = Descriptors.ExactMolWt(mol)
    mol_features['max_abs_partial_charge'] = Descriptors.MaxAbsPartialCharge(mol) 
    mol_features['num_radical_eletrons'] = Descriptors.NumRadicalElectrons(mol)
    mol_features['mol_log_p'] = Descriptors.MolLogP(mol)
    mol_features['mol_mr'] = Descriptors.MolMR(mol)
    mol_features['mol_wt'] = Descriptors.MolWt(mol)
    mol_features['heavy_atom_count'] = Descriptors.HeavyAtomCount(mol)
    mol_features['heavy_atom_mol_wt'] = Descriptors.HeavyAtomMolWt(mol)
    mol_features['nhoh_count'] = Descriptors.NHOHCount(mol)
    mol_features['no_count'] = Descriptors.NOCount(mol)
    mol_features['num_h_acceptors'] = Descriptors.NumHAcceptors(mol)
    mol_features['num_h_donors'] = Descriptors.NumHDonors(mol)
    mol_features['num_hetero_atoms'] = Descriptors.NumHeteroatoms(mol)
    mol_features['num_rotatable_bonds'] = Descriptors.NumRotatableBonds(mol)
    mol_features['num_valence_electrons'] = Descriptors.NumValenceElectrons(mol)
    mol_features['balabanj'] = rdkit.Chem.GraphDescriptors.BalabanJ(mol)
    mol_features['bertzct'] = rdkit.Chem.GraphDescriptors.BertzCT(mol)
    mol_features['ipc'] = rdkit.Chem.GraphDescriptors.Ipc(mol)
    mol_features['chi0'] = rdkit.Chem.GraphDescriptors.Chi0(mol)
    mol_features['chi1'] = rdkit.Chem.GraphDescriptors.Chi1(mol)
    mol_features['kappa1'] = rdkit.Chem.GraphDescriptors.Kappa1(mol)
    mol_features['hallkier_alpha'] = rdkit.Chem.GraphDescriptors.HallKierAlpha(mol)
    df_features = df_features.append(mol_features, ignore_index=True)

In [54]:
df_features.head(10)

Unnamed: 0,PUBCHEM_SID,balabanj,bertzct,chi0,chi1,exact_mol_wt,hallkier_alpha,heavy_atom_count,heavy_atom_mol_wt,ipc,...,mol_wt,nhoh_count,no_count,num_h_acceptors,num_h_donors,num_hetero_atoms,num_radical_eletrons,num_rotatable_bonds,num_valence_electrons,tpsa
0,842121.0,4.839995e-06,455.575868,16.424439,10.403997,356.111455,-1.51,23.0,336.612,51594.9,...,356.772,2.0,5.0,4.0,2.0,9.0,0.0,9.0,132.0,59.59
1,842122.0,2.083331e-06,583.676655,15.907567,11.347668,351.182588,-1.5,24.0,325.674,321981.1,...,351.882,0.0,6.0,6.0,0.0,7.0,0.0,7.0,132.0,56.07
2,842123.0,2.80333e-06,763.246281,19.493353,12.815628,421.149237,-1.75,28.0,398.667,1259164.0,...,421.851,1.0,7.0,7.0,1.0,11.0,0.0,6.0,156.0,76.3
3,842124.0,4.874999e-07,571.749178,19.363597,12.379918,368.17475,-2.48,26.0,343.205,452447.5,...,368.405,3.0,7.0,4.0,3.0,8.0,0.0,5.0,144.0,106.94
4,842125.0,2.083332e-06,639.421293,16.070703,11.292025,347.128821,-1.88,24.0,325.666,270761.5,...,347.842,0.0,4.0,4.0,0.0,5.0,0.0,5.0,128.0,38.77
5,842126.0,1.999998e-06,427.38986,13.501789,9.292025,301.108086,-1.46,20.0,281.61,25339.27,...,301.77,1.0,5.0,5.0,1.0,6.0,0.0,6.0,112.0,56.79
6,842127.0,2.879997e-06,551.124102,16.65649,11.079719,419.061131,-1.31,24.0,397.551,145021.6,...,420.735,1.0,6.0,4.0,1.0,8.0,0.0,5.0,134.0,61.88
7,842128.0,1.763332e-06,507.068437,14.872033,10.132664,323.165207,-1.06,22.0,297.656,76179.18,...,323.864,0.0,3.0,3.0,0.0,4.0,0.0,2.0,122.0,29.54
8,842129.0,7.538888e-07,383.283806,18.535169,11.272963,348.189651,-1.71,24.0,320.172,104663.8,...,348.396,3.0,9.0,6.0,3.0,9.0,0.0,7.0,140.0,127.61
9,842130.0,1.919998e-06,493.350894,15.416003,10.711944,336.196841,-1.06,23.0,307.675,204669.3,...,336.907,1.0,3.0,2.0,1.0,4.0,0.0,5.0,128.0,32.34


In [55]:
df_joined = pd.merge(df_features, df_labels, on = ['PUBCHEM_SID'] )

In [56]:
df_joined.head(10)

Unnamed: 0,PUBCHEM_SID,balabanj,bertzct,chi0,chi1,exact_mol_wt,hallkier_alpha,heavy_atom_count,heavy_atom_mol_wt,ipc,...,no_count,num_h_acceptors,num_h_donors,num_hetero_atoms,num_radical_eletrons,num_rotatable_bonds,num_valence_electrons,tpsa,PUBCHEM_ACTIVITY_OUTCOME,ACTIVITY
0,842121.0,4.839995e-06,455.575868,16.424439,10.403997,356.111455,-1.51,23.0,336.612,51594.9,...,5.0,4.0,2.0,9.0,0.0,9.0,132.0,59.59,Inactive,0.0
1,842122.0,2.083331e-06,583.676655,15.907567,11.347668,351.182588,-1.5,24.0,325.674,321981.1,...,6.0,6.0,0.0,7.0,0.0,7.0,132.0,56.07,Inactive,0.0
2,842123.0,2.80333e-06,763.246281,19.493353,12.815628,421.149237,-1.75,28.0,398.667,1259164.0,...,7.0,7.0,1.0,11.0,0.0,6.0,156.0,76.3,Inactive,0.0
3,842124.0,4.874999e-07,571.749178,19.363597,12.379918,368.17475,-2.48,26.0,343.205,452447.5,...,7.0,4.0,3.0,8.0,0.0,5.0,144.0,106.94,Inactive,0.0
4,842125.0,2.083332e-06,639.421293,16.070703,11.292025,347.128821,-1.88,24.0,325.666,270761.5,...,4.0,4.0,0.0,5.0,0.0,5.0,128.0,38.77,Inactive,0.0
5,842126.0,1.999998e-06,427.38986,13.501789,9.292025,301.108086,-1.46,20.0,281.61,25339.27,...,5.0,5.0,1.0,6.0,0.0,6.0,112.0,56.79,Inactive,0.0
6,842127.0,2.879997e-06,551.124102,16.65649,11.079719,419.061131,-1.31,24.0,397.551,145021.6,...,6.0,4.0,1.0,8.0,0.0,5.0,134.0,61.88,Inactive,0.0
7,842128.0,1.763332e-06,507.068437,14.872033,10.132664,323.165207,-1.06,22.0,297.656,76179.18,...,3.0,3.0,0.0,4.0,0.0,2.0,122.0,29.54,Inactive,0.0
8,842129.0,7.538888e-07,383.283806,18.535169,11.272963,348.189651,-1.71,24.0,320.172,104663.8,...,9.0,6.0,3.0,9.0,0.0,7.0,140.0,127.61,Inactive,0.0
9,842130.0,1.919998e-06,493.350894,15.416003,10.711944,336.196841,-1.06,23.0,307.675,204669.3,...,3.0,2.0,1.0,4.0,0.0,5.0,128.0,32.34,Inactive,0.0
