## Dataset Preparation and Preprocessing

#### Import Packages

In [1]:
import pandas as pd
import numpy as np

# Importing rdkit to extract chemical features of the drugs
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

#### Define smiles, oneil_dataset dataframes

In [2]:
smiles = pd.read_csv('Datasets/smile.csv')
smiles.head()

Unnamed: 0,drug_name,smiles
0,5-Fluorouracil,O=c1[nH]cc(F)c(=O)[nH]1
1,Veliparib,CC1(CCCN1)C2=NC3=C(C=CC=C3N2)C(=O)N
2,MK-1775,CC(C)(C1=NC(=CC=C1)N2C3=NC(=NC=C3C(=O)N2CC=C)N...
3,915019-65-7,CC(C)(C#N)C1=CC=C(C=C1)N2C3=C4C=C(C=CC4=NC=C3N...
4,Bortezomib,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...


In [4]:
# Defining main dataframe on oneildataset.csv

oneil_ds = pd.read_csv('Datasets/oneildataset.csv')
oneil_ds.head()

Unnamed: 0,block_id,drug_row,drug_col,cell_line_name,study_name,tissue_name,conc_row_unit,conc_col_unit,ic50_row,ic50_col,...,S_mean,S_max,synergy_zip,synergy_loewe,synergy_hsa,synergy_bliss,drug_row_clinical_phase,drug_col_clinical_phase,drug_row_target_name,drug_col_target_name
0,16145,5-Fluorouracil,Veliparib,EFM192B,ONEIL,breast,uM,uM,10.0,10.0,...,20.461,10.0815,-3.608485,-10.705039,-2.050132,-0.719085,4,3,Prelamin-A/C; Survival motor neuron protein; T...,Poly [ADP-ribose] polymerase-1; Poly [ADP-ribo...
1,16146,5-Fluorouracil,Veliparib,EFM192B,ONEIL,breast,uM,uM,10.0,10.0,...,14.846,4.4665,-9.420828,-12.552164,-3.897257,-2.56621,4,3,Prelamin-A/C; Survival motor neuron protein; T...,Poly [ADP-ribose] polymerase-1; Poly [ADP-ribo...
2,16147,5-Fluorouracil,Veliparib,EFM192B,ONEIL,breast,uM,uM,10.0,10.0,...,23.377,12.9975,3.65753,-4.027289,4.627618,5.958665,4,3,Prelamin-A/C; Survival motor neuron protein; T...,Poly [ADP-ribose] polymerase-1; Poly [ADP-ribo...
3,16148,5-Fluorouracil,Veliparib,EFM192B,ONEIL,breast,uM,uM,10.0,10.0,...,17.9165,7.537,0.521115,-9.395789,-0.740882,0.590165,4,3,Prelamin-A/C; Survival motor neuron protein; T...,Poly [ADP-ribose] polymerase-1; Poly [ADP-ribo...
4,16149,5-Fluorouracil,MK-1775,EFM192B,ONEIL,breast,uM,uM,10.0,0.5,...,26.591,26.3835,-8.567179,-1.107976,2.664619,-5.781006,4,2,Prelamin-A/C; Survival motor neuron protein; T...,Serine/threonine-protein kinase TBK1; Protein ...


In [6]:
oneil_ds.columns

Index(['block_id', 'drug_row', 'drug_col', 'cell_line_name', 'study_name',
       'tissue_name', 'conc_row_unit', 'conc_col_unit', 'ic50_row', 'ic50_col',
       'ri_row', 'ri_col', 'css_row', 'css_col', 'css_ri', 'S_sum', 'S_mean',
       'S_max', 'synergy_zip', 'synergy_loewe', 'synergy_hsa', 'synergy_bliss',
       'drug_row_clinical_phase', 'drug_col_clinical_phase',
       'drug_row_target_name', 'drug_col_target_name'],
      dtype='object')

#### Dataset Preprocessing

In [7]:
# extract features from drugs. we used lipinski's rule of five descriptors 

# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation

def lipinski(smiles, verbose=False):

    moldata= []
    for elem in smiles:
        mol=Chem.MolFromSmiles(elem) 
        moldata.append(mol)
       
    baseData= np.arange(1,1)
    i=0  
    for mol in moldata:        
       
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
           
        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])   
    
        if(i==0):
            baseData=row
        else:
            baseData=np.vstack([baseData, row])
        i=i+1      
    
    columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(data=baseData,columns=columnNames)
    
    return descriptors

df_lipinski = lipinski(smiles.smiles) # storing features in a dataframe

df_lipinski.head(2)

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,130.078,-0.7977,2.0,2.0
1,244.298,1.2604,3.0,3.0


In [9]:
smiles_with_descriptors = pd.concat([smiles, df_lipinski], axis=1) # concatening individual drugs and its features

smiles_with_descriptors

Unnamed: 0,drug_name,smiles,MW,LogP,NumHDonors,NumHAcceptors
0,5-Fluorouracil,O=c1[nH]cc(F)c(=O)[nH]1,130.078,-0.7977,2.0,2.0
1,Veliparib,CC1(CCCN1)C2=NC3=C(C=CC=C3N2)C(=O)N,244.298,1.2604,3.0,3.0
2,MK-1775,CC(C)(C1=NC(=CC=C1)N2C3=NC(=NC=C3C(=O)N2CC=C)N...,500.607,2.886,2.0,10.0
3,915019-65-7,CC(C)(C#N)C1=CC=C(C=C1)N2C3=C4C=C(C=CC4=NC=C3N...,469.548,5.89378,0.0,6.0
4,Bortezomib,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,384.245,0.3606,4.0,6.0
5,Carboplatinum,C1CC(C1)(C(=O)O)C(=O)O.N.N.[Pt],373.266,0.6474,4.0,4.0
6,cyclophosphamide,C1CNP(=O)(OC1)N(CCCl)CCCl,261.089,1.884,1.0,2.0
7,Dasatinib,CC1=C(C(=CC=C1)Cl)NC(=O)C2=CN=C(S2)NC3=CC(=NC(...,488.017,3.31354,3.0,9.0
8,dexamethasone,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,392.467,1.8957,3.0,5.0
9,Dinaciclib,CCC1=C2N=C(C=C(N2N=C1)NCC3=C[N+](=CC=C3)[O-])N...,396.495,2.2785,2.0,7.0


In [10]:
smiles_with_descriptors.to_csv('smile_with_desc.csv',index=False) # saving the output in a csv file for later use

smile_with_desc = pd.read_csv('smile_with_desc.csv')

In [11]:
# finding all the possible two-drug combinations from the smiles_with_desc dataframe and storing it in an empty list

import itertools

drug_comb = []
for L in range(0, len(smile_with_desc.drug_name)+1):
    for subset in itertools.combinations(smile_with_desc.drug_name, 2): # 2 is used to restrict the code in forming only 2 combinaitons
        drug_comb.append(subset)

In [12]:
drug_comb[0:6]

[('5-Fluorouracil', 'Veliparib'),
 ('5-Fluorouracil', 'MK-1775'),
 ('5-Fluorouracil', '915019-65-7'),
 ('5-Fluorouracil', 'Bortezomib'),
 ('5-Fluorouracil', 'Carboplatinum'),
 ('5-Fluorouracil', 'cyclophosphamide')]

##### Creating a dataframe of the combined drugs

In [13]:
df_final = pd.DataFrame(columns = ['drug_row','drug_col'])

In [14]:
for i in drug_comb[:6]:
    print(i[0],i[1])

5-Fluorouracil Veliparib
5-Fluorouracil MK-1775
5-Fluorouracil 915019-65-7
5-Fluorouracil Bortezomib
5-Fluorouracil Carboplatinum
5-Fluorouracil cyclophosphamide


In [15]:
for i in drug_comb:
    df_final= df_final.append({'drug_row':i[0],'drug_col':i[1]} ,ignore_index=True)

In [16]:
df_final.duplicated().value_counts()

True     28920
False      720
dtype: int64

In [17]:
df_final = df_final.drop_duplicates()

In [18]:
df_final

Unnamed: 0,drug_row,drug_col
0,5-Fluorouracil,Veliparib
1,5-Fluorouracil,MK-1775
2,5-Fluorouracil,915019-65-7
3,5-Fluorouracil,Bortezomib
4,5-Fluorouracil,Carboplatinum
...,...,...
736,vinorelbine,891494-63-6
737,vinorelbine,MK-4541
738,Vorinostat,891494-63-6
739,Vorinostat,MK-4541


In [19]:
merge_df_final_drug1_with_smiles_features =  pd.merge(df_final , smile_with_desc ,how="left",left_on=['drug_row'],right_on=['drug_name'])

In [20]:
merge_df_final_drug1_with_smiles_features.head(2)

Unnamed: 0,drug_row,drug_col,drug_name,smiles,MW,LogP,NumHDonors,NumHAcceptors
0,5-Fluorouracil,Veliparib,5-Fluorouracil,O=c1[nH]cc(F)c(=O)[nH]1,130.078,-0.7977,2.0,2.0
1,5-Fluorouracil,MK-1775,5-Fluorouracil,O=c1[nH]cc(F)c(=O)[nH]1,130.078,-0.7977,2.0,2.0


In [21]:
merge_df_final_drug2_with_smiles_features = merge_df_final_drug1_with_smiles_features.merge( smile_with_desc ,how="left",left_on=['drug_col'],right_on=['drug_name'],suffixes=('_row','_col'))

In [22]:
merge_df_final_drug2_with_smiles_features = merge_df_final_drug2_with_smiles_features.drop_duplicates()

In [23]:
merge_df_final_drug2_with_smiles_features.head(2)

Unnamed: 0,drug_row,drug_col,drug_name_row,smiles_row,MW_row,LogP_row,NumHDonors_row,NumHAcceptors_row,drug_name_col,smiles_col,MW_col,LogP_col,NumHDonors_col,NumHAcceptors_col
0,5-Fluorouracil,Veliparib,5-Fluorouracil,O=c1[nH]cc(F)c(=O)[nH]1,130.078,-0.7977,2.0,2.0,Veliparib,CC1(CCCN1)C2=NC3=C(C=CC=C3N2)C(=O)N,244.298,1.2604,3.0,3.0
1,5-Fluorouracil,MK-1775,5-Fluorouracil,O=c1[nH]cc(F)c(=O)[nH]1,130.078,-0.7977,2.0,2.0,MK-1775,CC(C)(C1=NC(=CC=C1)N2C3=NC(=NC=C3C(=O)N2CC=C)N...,500.607,2.886,2.0,10.0


In [24]:
merge_df_final_drug2_with_smiles_features.columns

Index(['drug_row', 'drug_col', 'drug_name_row', 'smiles_row', 'MW_row',
       'LogP_row', 'NumHDonors_row', 'NumHAcceptors_row', 'drug_name_col',
       'smiles_col', 'MW_col', 'LogP_col', 'NumHDonors_col',
       'NumHAcceptors_col'],
      dtype='object')

In [25]:
merge_df_final_drug2_with_smiles_features = merge_df_final_drug2_with_smiles_features.drop(labels=['drug_name_row', 'smiles_row','drug_name_col','smiles_col'], axis = 1)

In [26]:
merge_df_final_drug2_with_smiles_features.head(2)

Unnamed: 0,drug_row,drug_col,MW_row,LogP_row,NumHDonors_row,NumHAcceptors_row,MW_col,LogP_col,NumHDonors_col,NumHAcceptors_col
0,5-Fluorouracil,Veliparib,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
1,5-Fluorouracil,MK-1775,130.078,-0.7977,2.0,2.0,500.607,2.886,2.0,10.0


In [27]:
labls = ['drug_row','drug_col']

In [28]:
merged_df_with_oneil_ds = pd.merge(oneil_ds, merge_df_final_drug2_with_smiles_features,on = labls)

In [29]:
merged_df_with_oneil_ds.head(2)

Unnamed: 0,block_id,drug_row,drug_col,cell_line_name,study_name,tissue_name,conc_row_unit,conc_col_unit,ic50_row,ic50_col,...,drug_row_target_name,drug_col_target_name,MW_row,LogP_row,NumHDonors_row,NumHAcceptors_row,MW_col,LogP_col,NumHDonors_col,NumHAcceptors_col
0,16145,5-Fluorouracil,Veliparib,EFM192B,ONEIL,breast,uM,uM,10.0,10.0,...,Prelamin-A/C; Survival motor neuron protein; T...,Poly [ADP-ribose] polymerase-1; Poly [ADP-ribo...,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
1,16146,5-Fluorouracil,Veliparib,EFM192B,ONEIL,breast,uM,uM,10.0,10.0,...,Prelamin-A/C; Survival motor neuron protein; T...,Poly [ADP-ribose] polymerase-1; Poly [ADP-ribo...,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0


In [30]:
merged_df_with_oneil_ds.synergy_bliss

0       -0.719085
1       -2.566210
2        5.958665
3        0.590165
4       -5.484395
          ...    
7255     0.833246
7256    -3.446151
7257   -10.614713
7258    -4.272463
7259   -11.578338
Name: synergy_bliss, Length: 7260, dtype: float64

In [31]:
# Dropping unwanted columns from the dataset

columns_to_drop = ['tissue_name','study_name','conc_row_unit', 'conc_col_unit','drug_row_target_name', 'drug_col_target_name']

merged_df_with_oneil_ds = merged_df_with_oneil_ds.drop(labels=columns_to_drop, axis=1)

merged_df_with_oneil_ds.head(2)

Unnamed: 0,block_id,drug_row,drug_col,cell_line_name,ic50_row,ic50_col,ri_row,ri_col,css_row,css_col,...,drug_row_clinical_phase,drug_col_clinical_phase,MW_row,LogP_row,NumHDonors_row,NumHAcceptors_row,MW_col,LogP_col,NumHDonors_col,NumHAcceptors_col
0,16145,5-Fluorouracil,Veliparib,EFM192B,10.0,10.0,19.098,-1.661,23.201,35.158,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
1,16146,5-Fluorouracil,Veliparib,EFM192B,10.0,10.0,19.098,-1.661,16.258,30.871,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0


In [32]:
# encoding cell_line_name

merged_df_with_oneil_ds['cell_line_name'].unique().tolist()

['EFM192B', 'KPL1', 'MDAMB436', 'OCUBM', 'T-47D', 'ZR751']

In [33]:
encode = {'EFM192B':1, 'KPL1':2, 'MDAMB436':3, 'OCUBM':4, 'T-47D':5, 'ZR751':6}

In [34]:
merged_df_with_oneil_ds = merged_df_with_oneil_ds.replace({'cell_line_name':encode})

In [35]:
merged_df_with_oneil_ds['cell_line_name'].unique().tolist()

[1, 2, 3, 4, 5, 6]

In [36]:
merged_df_with_oneil_ds.head()

Unnamed: 0,block_id,drug_row,drug_col,cell_line_name,ic50_row,ic50_col,ri_row,ri_col,css_row,css_col,...,drug_row_clinical_phase,drug_col_clinical_phase,MW_row,LogP_row,NumHDonors_row,NumHAcceptors_row,MW_col,LogP_col,NumHDonors_col,NumHAcceptors_col
0,16145,5-Fluorouracil,Veliparib,1,10.0,10.0,19.098,-1.661,23.201,35.158,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
1,16146,5-Fluorouracil,Veliparib,1,10.0,10.0,19.098,-1.661,16.258,30.871,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
2,16147,5-Fluorouracil,Veliparib,1,10.0,10.0,19.098,-1.661,23.611,40.58,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
3,16148,5-Fluorouracil,Veliparib,1,10.0,10.0,19.098,-1.661,19.803,33.467,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0
4,27701,5-Fluorouracil,Veliparib,2,8.055629,3.896132,22.529,5.58,29.098,41.526,...,4,3,130.078,-0.7977,2.0,2.0,244.298,1.2604,3.0,3.0


##### Forming a dataset with only the Synergy_bliss values

In [38]:
bliss_labels = ['synergy_hsa', 'synergy_zip','synergy_loewe',]
main_syn_bliss = merged_df_with_oneil_ds.drop(labels=bliss_labels, axis =1)

In [39]:
main_syn_bliss.to_csv('Datasets/main_syn_bliss.csv', index=False)

# Next step is to apply machine learning models to the dataset