In [2]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing

import gc
import warnings 
warnings.filterwarnings('ignore')

## importing datasets

In [3]:
ts_df = pd.read_csv("/Users/george/Desktop/LW-DDI Project/twosides.csv")
fe_drug1 = pd.read_csv("/Users/george/Desktop/LW-DDI Project/cxe_feat_eng_drug1_droppedcolumns.csv")
fe_drug2 = pd.read_csv("/Users/george/Desktop/LW-DDI Project/cxe_feat_eng_drug2_droppedcolumns.csv")

## Cleaning data 

In [4]:
fe_drug1 = fe_drug1.drop(columns = 'Unnamed: 0')
fe_drug2 = fe_drug2.drop(columns ='Unnamed: 0')
ts_df = ts_df.drop(columns = 'Unnamed: 0')
ts_filtered = ts_df[['Drug1','Drug2','Y']] # filtering smiles and Y from twosides.csv

In [5]:
def df_optimized(df, verbose=True):
    in_size = df.memory_usage(index=True).sum()
    for type in ["float", "integer"]:
        l_cols = list(df.select_dtypes(include=type))
        for col in l_cols:
            df[col] = pd.to_numeric(df[col], downcast=type)
            df[col] = round(df[col],4)
            if type == "float":
                df[col] = pd.to_numeric(df[col], downcast="integer")
    out_size = df.memory_usage(index=True).sum()
    df.replace({False: 0, True: 1}, inplace=True) # converting bool into int 
    ratio = (1 - round(out_size / in_size, 2)) * 100
    GB = out_size / 1000000000
    if verbose:
        print("optimized size by {} % | {} GB".format(ratio, GB))
    return df

# optimising datasets 
fe_drug1 = df_optimized(fe_drug1)
fe_drug2 = df_optimized(fe_drug2)

optimized size by 65.99999999999999 % | 0.001197632 GB
optimized size by 65.99999999999999 % | 0.001236428 GB


## Merging Datasets

In [6]:
merged_drug1 = ts_filtered[['Drug1']].merge(fe_drug1, on='Drug1', how='left')
merged_drug2 = ts_filtered[['Drug2','Y']].merge(fe_drug2, on='Drug2', how='left')

for col in merged_drug2.columns[2:]:
    merged_drug2.rename(columns = {col: col+'_1'}, inplace=True)

final_merge = pd.concat([merged_drug1,merged_drug2],axis =1)
final_df = final_merge.dropna()

## Encoding Smiles to Numbers

In [7]:
total_smiles = final_df['Drug1'].append(final_df['Drug2']) # combining both drug1 and drug2 smiles together 
unique_smiles = (set(list(total_smiles))) #create a list of drug_names and extract unique smiles
unique_smiles = pd.DataFrame(unique_smiles).rename(columns ={0:'Drug'}) # create a dataframe for label encoding purposes

# fit and transform. 
le = preprocessing.LabelEncoder()
le.fit(unique_smiles[['Drug']])
unique_smiles['Drug_number'] = le.transform(unique_smiles[['Drug']])

# downcasting 
unique_smiles['Drug_number']= \
    pd.to_numeric(unique_smiles['Drug_number'], downcast="integer")


# forming a dictionary to map onto the final_df
drug_dict = pd.Series(unique_smiles.Drug_number.values, index = unique_smiles.Drug).to_dict()

# mapping drug numbers to smiles in the final_df 
final_df['Drug1'] = final_df['Drug1'].map(drug_dict)
final_df['Drug2'] = final_df['Drug2'].map(drug_dict)

## Differencing

In [8]:
## deleting dataframes that are not required to save up ram 
del merged_drug1 
del merged_drug2
del final_merge
del ts_df
del ts_filtered
gc.collect()

0

In [9]:
reclass = pd.read_csv("/Users/george/Desktop/LW-DDI Project/gl_reclassification_encoded.csv")
reclass = reclass[['Y','Y_cat']]

In [10]:
## splitting the datasets 

df_list = []

start = 0
last = 500_000
for i in range(1,10):
    if i == 9:
        df_list.append(final_df.iloc[last:])
    else:
        df_list.append(final_df.iloc[start:last])
        start += 500_000
        last += 500_000
        
del final_df 
gc.collect()

# extracting the all the features for drug1 and drug2
f_list = list(fe_drug1.columns[1:])

final_df = 0
for index, df in enumerate(df_list):
    print(index)
    for col in f_list:
        df[col+'_diff']= df[col].sub(df[col+'_1'])
        df.drop(columns = [col,col+'_1'], inplace = True)
    df_merged = df.merge(reclass, on='Y', how='left')
    del df_list[index]
    gc.collect()
    if index == 0:
        final_df = df_merged
    else:
        final_df = pd.concat([final_df,df_merged])  

0
1
2
3
4


In [None]:
final_df.to_csv("/Users/george/Desktop/LW-DDI Project/final_df.csv")

## Reclassification and encoding

In [None]:
## code to encode 
# reclass = pd.read_csv("/Users/george/Desktop/LW-DDI Project/mt_reclassified_twosides_labels.csv")
# reclass = reclass.rename(columns = {'Unnamed: 0': 'Y'}).dropna().drop(columns ='side_effect')
# reclass

# le = preprocessing.LabelEncoder()
# le.fit(reclass[['sub_system']])
# reclass['Y_cat']= le.transform(reclass[['sub_system']])
# reclass_cat = reclass[['Y','Y_cat']]

In [None]:
## import reclassification csv for consistency purposes 


In [None]:
reclass

In [None]:
ten_percent_df = df1.merge(reclass_cat, on='Y', how='left')

## Exporting CSV

In [None]:
ten_percent_df.to_csv("/Users/george/Desktop/LW-DDI Project/ten_percent_df.csv")

In [None]:
unique_smiles.to_csv("/Users/george/Desktop/LW-DDI Project/gl_unique_smiles_encoded.csv")