In [2]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing

import gc
import warnings 
warnings.filterwarnings('ignore')

## importing datasets

In [3]:
ts_df = pd.read_csv("/Users/george/Desktop/LW-DDI Project/twosides.csv")
fe_drug1 = pd.read_csv("/Users/george/Desktop/LW-DDI Project/cxe_feat_eng_drug1_droppedcolumns.csv")
fe_drug2 = pd.read_csv("/Users/george/Desktop/LW-DDI Project/cxe_feat_eng_drug2_droppedcolumns.csv")

## Cleaning data 

In [4]:
fe_drug1 = fe_drug1.drop(columns = 'Unnamed: 0')
fe_drug2 = fe_drug2.drop(columns ='Unnamed: 0')
ts_df = ts_df.drop(columns = 'Unnamed: 0')
ts_filtered = ts_df[['Drug1','Drug2','Y']] # filtering smiles and Y from twosides.csv

In [5]:
def df_optimized(df, verbose=True):
    in_size = df.memory_usage(index=True).sum()
    for type in ["float", "integer"]:
        l_cols = list(df.select_dtypes(include=type))
        for col in l_cols:
            df[col] = pd.to_numeric(df[col], downcast=type)
            df[col] = round(df[col],4)
            if type == "float":
                df[col] = pd.to_numeric(df[col], downcast="integer")
    out_size = df.memory_usage(index=True).sum()
    df.replace({False: 0, True: 1}, inplace=True) # converting bool into int 
    ratio = (1 - round(out_size / in_size, 2)) * 100
    GB = out_size / 1000000000
    if verbose:
        print("optimized size by {} % | {} GB".format(ratio, GB))
    return df

# optimising datasets 
fe_drug1 = df_optimized(fe_drug1)
fe_drug2 = df_optimized(fe_drug2)

optimized size by 65.99999999999999 % | 0.001197632 GB
optimized size by 65.99999999999999 % | 0.001236428 GB


## Merging Datasets

In [6]:
merged_drug1 = ts_filtered[['Drug1']].merge(fe_drug1, on='Drug1', how='left')
merged_drug2 = ts_filtered[['Drug2','Y']].merge(fe_drug2, on='Drug2', how='left')

for col in merged_drug2.columns[2:]:
    merged_drug2.rename(columns = {col: col+'_1'}, inplace=True)

final_merge = pd.concat([merged_drug1,merged_drug2],axis =1)
final_df = final_merge.dropna()

## Encoding Smiles to Numbers

In [7]:
total_smiles = final_df['Drug1'].append(final_df['Drug2']) # combining both drug1 and drug2 smiles together 
unique_smiles = (set(list(total_smiles))) #create a list of drug_names and extract unique smiles
unique_smiles = pd.DataFrame(unique_smiles).rename(columns ={0:'Drug'}) # create a dataframe for label encoding purposes

# fit and transform. 
le = preprocessing.LabelEncoder()
le.fit(unique_smiles[['Drug']])
unique_smiles['Drug_number'] = le.transform(unique_smiles[['Drug']])

# downcasting 
unique_smiles['Drug_number']= \
    pd.to_numeric(unique_smiles['Drug_number'], downcast="integer")


# forming a dictionary to map onto the final_df
drug_dict = pd.Series(unique_smiles.Drug_number.values, index = unique_smiles.Drug).to_dict()

# mapping drug numbers to smiles in the final_df 
final_df['Drug1'] = final_df['Drug1'].map(drug_dict)
final_df['Drug2'] = final_df['Drug2'].map(drug_dict)

## Differencing

In [8]:
## deleting dataframes that are not required to save up ram 
del merged_drug1 
del merged_drug2
del final_merge
del ts_df
del ts_filtered
gc.collect()

0

In [9]:
reclass = pd.read_csv("/Users/george/Desktop/LW-DDI Project/gl_reclassification_encoded.csv")
reclass = reclass[['Y','Y_cat']]

In [10]:
## splitting the datasets 

df_list = []

start = 0
last = 500_000
for i in range(1,10):
    if i == 9:
        df_list.append(final_df.iloc[last:])
    else:
        df_list.append(final_df.iloc[start:last])
        start += 500_000
        last += 500_000
        
del final_df 
gc.collect()

# extracting the all the features for drug1 and drug2
f_list = list(fe_drug1.columns[1:])

final_df = 0
for index, df in enumerate(df_list):
    print(index)
    for col in f_list:
        df[col+'_diff']= df[col].sub(df[col+'_1'])
        df.drop(columns = [col,col+'_1'], inplace = True)
    df_merged = df.merge(reclass, on='Y', how='left')
    del df_list[index]
    gc.collect()
    if index == 0:
        final_df = df_merged
    else:
        final_df = pd.concat([final_df,df_merged])  

0
1
2
3
4


Unnamed: 0,Drug1,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,...,SRW09_1,SRW10_1,TSRW10_1,MW_1,AMW_1,WPath_1,WPol_1,Zagreb1_1,Zagreb2_1,mZagreb2_1
3500000,529,26.058201,20.0369,3,0,26,27,53,33,0,...,8.6185,10.7075,83.929703,392.098511,8.168700,1573,46,156,198,5.3333
3500001,437,19.270201,14.2994,0,0,12,12,45,25,0,...,0.0000,9.6602,44.220402,373.060211,13.817000,3300000110,14,50,60,2.2500
3500002,360,14.308500,12.8113,0,0,6,6,32,19,0,...,0.0000,11.0439,90.834198,747.476807,6.177500,9201,104,274,330,11.3889
3500003,84,12.118300,10.7886,0,0,9,10,27,16,0,...,7.4018,11.0129,80.078499,410.165985,7.324400,1670,70,168,221,5.8194
3500004,60,14.349000,12.5849,0,0,12,12,34,19,0,...,0.0000,0.0000,2.000000,73.932602,36.966301,100000000,0,0,0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999995,369,17.653799,14.3213,0,0,17,17,35,22,0,...,0.0000,9.1513,43.556099,180.042206,8.573400,246,16,60,66,2.9722
3999996,34,17.246201,15.3827,0,0,16,16,34,22,0,...,7.4018,11.0129,80.078499,426.136414,7.609600,1670,70,168,221,5.8194
3999997,542,16.320499,13.1245,0,1,12,12,44,21,0,...,0.0000,9.2345,49.527500,260.173615,6.194600,692,20,78,81,4.2083
3999998,105,11.451600,10.3312,0,0,6,6,26,15,0,...,7.7583,10.8679,82.397598,416.202087,6.823000,1905,61,174,221,5.9375


In [21]:
final_df

Unnamed: 0,Drug1,Drug2,Y,ABC_diff,ABCGG_diff,nAcid_diff,nBase_diff,nAromAtom_diff,nAromBond_diff,nAtom_diff,...,SRW10_diff,TSRW10_diff,MW_diff,AMW_diff,WPath_diff,WPol_diff,Zagreb1_diff,Zagreb2_diff,mZagreb2_diff,Y_cat
0,278,408,767,-0.215599,1.567100,1,0,-6,-6,-11,...,0.5914,15.244205,11.882172,1.944400,-315,3,8,19,-0.6528,9
1,278,408,25,-0.215599,1.567100,1,0,-6,-6,-11,...,0.5914,15.244205,11.882172,1.944400,-315,3,8,19,-0.6528,1
2,278,408,85,-0.215599,1.567100,1,0,-6,-6,-11,...,0.5914,15.244205,11.882172,1.944400,-315,3,8,19,-0.6528,20
3,278,408,735,-0.215599,1.567100,1,0,-6,-6,-11,...,0.5914,15.244205,11.882172,1.944400,-315,3,8,19,-0.6528,16
4,278,408,959,-0.215599,1.567100,1,0,-6,-6,-11,...,0.5914,15.244205,11.882172,1.944400,-315,3,8,19,-0.6528,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149436,6,367,912,-9.587900,-5.935700,0,0,-14,-15,-15,...,-1.3292,-31.428703,-111.073807,-0.147100,-791,-22,-74,-98,-1.7222,12
149437,561,75,1281,4.221701,1.406500,-4,0,7,7,8,...,0.0513,1.071003,28.070190,-1.635900,417,14,28,36,1.3333,4
149438,439,428,289,14.894001,14.807800,0,-2,-15,-16,80,...,-0.3948,6.983406,377.441711,-0.975100,25233,8,58,19,6.0695,22
149439,119,554,2,2.218801,4.250000,2,2,-6,-6,16,...,0.0849,4.133301,48.111725,-1.362401,479,7,12,13,1.1666,6


In [None]:
final_df.to_csv("/Users/george/Desktop/LW-DDI Project/final_df.csv")

## Reclassification and encoding

In [None]:
## code to encode 
# reclass = pd.read_csv("/Users/george/Desktop/LW-DDI Project/mt_reclassified_twosides_labels.csv")
# reclass = reclass.rename(columns = {'Unnamed: 0': 'Y'}).dropna().drop(columns ='side_effect')
# reclass

# le = preprocessing.LabelEncoder()
# le.fit(reclass[['sub_system']])
# reclass['Y_cat']= le.transform(reclass[['sub_system']])
# reclass_cat = reclass[['Y','Y_cat']]

In [None]:
## import reclassification csv for consistency purposes 


In [None]:
reclass

In [None]:
ten_percent_df = df1.merge(reclass_cat, on='Y', how='left')

## Exporting CSV

In [None]:
ten_percent_df.to_csv("/Users/george/Desktop/LW-DDI Project/ten_percent_df.csv")

In [None]:
unique_smiles.to_csv("/Users/george/Desktop/LW-DDI Project/gl_unique_smiles_encoded.csv")