In [14]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing

import gc
import warnings 
warnings.filterwarnings('ignore')

## Functions

In [15]:
def df_optimized(df, verbose=True):
    """Shrink the numeric columns of ``df`` in place and return the same frame.

    Steps, in order:
      1. Boolean columns become ``int8`` and any remaining ``True``/``False``
         values are replaced by ``1``/``0``.
      2. Float columns are downcast, rounded to 4 decimals, and turned into
         integers when pandas can do so (``pd.to_numeric(downcast="integer")``).
      3. Integer columns are downcast to the smallest integer dtype that fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is MODIFIED IN PLACE; callers in this notebook
        rely on that and ignore the return value.
    verbose : bool, default True
        When True, print the memory saving and the final size in GB.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    in_size = df.memory_usage(index=True).sum()

    # Convert booleans first so that they take part in the integer downcast
    # below and are included in the reported output size.
    for col in list(df.select_dtypes(include="bool")):
        df[col] = df[col].astype("int8")
    # Catches True/False values living in non-bool (e.g. object) columns.
    df.replace({False: 0, True: 1}, inplace=True)

    # Column lists are recomputed per pass on purpose: float columns that
    # became integers in the first pass are picked up again by the second.
    for kind in ("float", "integer"):
        for col in list(df.select_dtypes(include=kind)):
            if kind == "float":
                shrunk = pd.to_numeric(df[col], downcast="float").round(4)
                df[col] = pd.to_numeric(shrunk, downcast="integer")
            else:
                df[col] = pd.to_numeric(df[col], downcast="integer")

    out_size = df.memory_usage(index=True).sum()
    ratio = (1 - round(out_size / in_size, 2)) * 100
    GB = out_size / 1000000000
    if verbose:
        print("optimized size by {} % | {} GB".format(ratio, GB))
    return df

## Importing Files

In [16]:
# Single place to edit when the project folder moves; every input file below
# is read from this directory.
DATA_DIR = "/Users/george/Desktop/LW-DDI Project"

# Drug-pair interaction table (Drug1 / Drug2 hold SMILES strings, Y is the label).
ts_df = pd.read_csv(DATA_DIR + "/twosides.csv")
# Lookup that assigns each label Y to a category Y_cat.
reclass = pd.read_csv(DATA_DIR + "/gl_reclassification_encoded.csv")
# Per-drug feature tables, keyed by the 'Drug1' and 'Drug2' columns respectively.
fe_drug1 = pd.read_csv(DATA_DIR + "/cxe_feat_eng_drug1_droppedcolumns.csv")
fe_drug2 = pd.read_csv(DATA_DIR + "/cxe_feat_eng_drug2_droppedcolumns.csv")

## Simple Cleaning

In [17]:
# Keep only the label -> category mapping; the other columns are not used.
reclass = reclass[['Y', 'Y_cat']]

# 'Unnamed: 0' is presumably the index column written by an earlier to_csv
# call (TODO confirm). errors='ignore' makes this cell safe to re-run: the
# second execution no longer raises KeyError because the column is already gone.
fe_drug1 = fe_drug1.drop(columns='Unnamed: 0', errors='ignore')
fe_drug2 = fe_drug2.drop(columns='Unnamed: 0', errors='ignore')

## Encoding 

In [18]:
def drug_map(ts_df, start_id=100):
    """Build a ``{drug: integer ID}`` lookup covering both drug columns.

    The distinct values of ``ts_df['Drug1']`` and ``ts_df['Drug2']`` are
    sorted and numbered consecutively from ``start_id``, so the mapping is
    deterministic for a given set of drugs.

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Must contain the columns ``'Drug1'`` and ``'Drug2'``.
    start_id : int, default 100
        ID given to the first drug in sort order. The notebook later combines
        the two IDs of a pair into ``DD_ID`` and splits it again with
        ``// 1000`` and ``% 1000``, which only works while every ID stays
        below 1000.

    Returns
    -------
    dict
        Maps each distinct drug value to its integer ID.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    all_drugs = pd.concat([ts_df['Drug1'], ts_df['Drug2']], ignore_index=True)
    unique_drugs = all_drugs.drop_duplicates().sort_values()

    # Size the ID range from the data instead of hard-coding the drug count,
    # so the function works for any number of distinct drugs.
    drug_ids = range(start_id, start_id + len(unique_drugs))
    return dict(zip(unique_drugs, drug_ids))

# Notebook-level lookup (drug value -> integer ID); later cells use it to
# encode the drug columns of ts_df, fe_drug1 and fe_drug2.
drug_dict =drug_map(ts_df)

In [19]:
def clean_twosides(ts_df):
    """Encode the drug columns, add a pair ID and return the table sorted by it.

    ``Drug1`` and ``Drug2`` are replaced by their integer IDs from the
    notebook-level ``drug_dict``; the columns ``'Unnamed: 0'``, ``'Drug1_ID'``
    and ``'Drug2_ID'`` are dropped; a ``DD_ID`` column identifying the
    (Drug1, Drug2) pair is inserted at position 2; the result is passed
    through ``df_optimized`` and sorted by ``DD_ID``.

    The input frame is not modified.

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Raw interaction table containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``DD_ID``.

    Raises
    ------
    ValueError
        If a drug is missing from ``drug_dict`` or an ID lies outside 0..999.
    """
    # .assign builds a new frame, so the caller's raw table stays untouched
    # and the function can be called again on the same input.
    encoded = ts_df.assign(
        Drug1=ts_df['Drug1'].map(drug_dict),
        Drug2=ts_df['Drug2'].map(drug_dict),
    ).drop(columns=['Unnamed: 0', 'Drug1_ID', 'Drug2_ID'])  # drop irrelevant columns

    ids = encoded[['Drug1', 'Drug2']]
    n_unmapped = int(ids.isna().any(axis=1).sum())
    if n_unmapped:
        raise ValueError(
            "{} row(s) contain a drug that is missing from drug_dict".format(n_unmapped)
        )
    if len(ids) and (ids.min().min() < 0 or ids.max().max() > 999):
        raise ValueError("drug IDs must lie in 0..999 to be packed into DD_ID")

    # Pair ID = Drug1 * 1000 + Drug2. For three-digit IDs this equals the
    # concatenation of the two IDs, and it can always be split back with
    # DD_ID // 1000 and DD_ID % 1000.
    DD_ID = (ids['Drug1'].astype('int64') * 1000 + ids['Drug2'].astype('int64')).astype('int32')
    encoded.insert(loc=2, column='DD_ID', value=DD_ID)

    # df_optimized works in place, so its return value is not needed.
    df_optimized(encoded)

    return encoded.sort_values(by=['DD_ID'])
ts_df = clean_twosides(ts_df)

optimized size by 64.0 % | 0.046494538 GB


In [20]:
# Display the cleaned interaction table returned by clean_twosides (sorted by DD_ID).
ts_df

Unnamed: 0,Drug1,Drug2,DD_ID,Y
713589,100,113,100113,267
713624,100,113,100113,926
713623,100,113,100113,1143
713622,100,113,100113,391
713621,100,113,100113,329
...,...,...,...,...
342688,744,743,744743,1182
3660577,744,743,744743,1118
3983500,744,743,744743,615
3809163,744,743,744743,1270


## Reshaping 

In [21]:
def ts_pivot(ts_df, reclass_df=None):
    """Collapse the interaction table to one row per drug pair with 0/1 category flags.

    Each label ``Y`` is mapped to its category ``Y_cat``. The result has one
    row per ``DD_ID`` and one column per category, holding 1 when the pair
    has at least one interaction in that category and 0 otherwise.

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Needs the columns ``'DD_ID'`` and ``'Y'``.
    reclass_df : pandas.DataFrame, optional
        Lookup with the columns ``'Y'`` and ``'Y_cat'``. Defaults to the
        notebook-level ``reclass`` table.

    Returns
    -------
    pandas.DataFrame
        Columns ``DD_ID``, ``Drug1``, ``Drug2`` followed by one indicator
        column per ``Y_cat`` value, sorted by ``DD_ID``.
    """
    if reclass_df is None:
        reclass_df = reclass

    merged = ts_df.merge(reclass_df, on='Y', how='left')  # merging reclassification

    # One indicator column per category, reduced per pair with max(): a flag
    # is 1 as soon as any row of the pair falls into the category. This
    # avoids summing DD_ID values (which can overflow int32) and the
    # per-element Python lambda previously used to binarize the sums.
    pivot_df = (
        pd.get_dummies(merged['Y_cat'])
        .groupby(merged['DD_ID'])
        .max()
        .astype('int64')
    )

    # slicing the index (= DD_ID) back into drug 1 and drug 2
    Drug1 = pivot_df.index // 1000
    Drug2 = pivot_df.index % 1000

    # adding drug 1 and drug 2 back in front of the category columns
    pivot_df.insert(loc=0, column='Drug1', value=Drug1)
    pivot_df.insert(loc=1, column='Drug2', value=Drug2)

    return pivot_df.reset_index()

# One row per drug pair (DD_ID) with 0/1 flags per Y_cat category.
pivot_df = ts_pivot(ts_df)

In [22]:
# Display the per-pair indicator table produced by ts_pivot.
pivot_df

Unnamed: 0,DD_ID,Drug1,Drug2,0,1,2,3,4,5,6,...,25,26,27,28,29,30,31,32,33,34
0,100113,100,113,0,1,1,1,1,0,0,...,0,1,0,0,0,1,1,0,0,0
1,100123,100,123,1,1,1,1,1,0,0,...,1,1,1,0,0,1,1,0,1,0
2,100144,100,144,0,1,1,0,1,0,0,...,1,1,0,0,0,1,1,0,0,0
3,100211,100,211,0,1,0,1,1,0,0,...,1,0,0,0,0,1,0,0,0,0
4,100213,100,213,1,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63468,744737,744,737,0,0,1,0,1,0,0,...,1,0,0,0,0,1,0,1,1,0
63469,744739,744,739,1,1,1,1,1,1,0,...,1,1,1,0,1,1,1,1,1,1
63470,744740,744,740,0,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0
63471,744742,744,742,1,1,1,0,1,0,0,...,1,0,1,0,0,1,1,0,0,0


## Merging with drug features 

In [23]:
# Map the SMILES keys of both feature tables to integer IDs using drug_dict.
# New frames are built instead of editing fe_drug1 / fe_drug2 in place, so
# re-running this cell does not map already-encoded IDs to NaN or append the
# '_1' suffix a second time.
drug1_features = fe_drug1.assign(Drug1=fe_drug1['Drug1'].map(drug_dict))

# Suffix every drug-2 feature with '_1' so it can be told apart from the
# drug-1 feature of the same name after merging.
drug2_features = (
    fe_drug2
    .assign(Drug2=fe_drug2['Drug2'].map(drug_dict))
    .rename(columns=lambda name: name if name == 'Drug2' else name + '_1')
)

# Attach the drug-1 and drug-2 features to every drug pair.
df = pivot_df.merge(drug1_features, on='Drug1', how='left')
df = df.merge(drug2_features, on='Drug2', how='left')
# A left merge adds rows when a key occurs more than once on the right;
# fail loudly instead of silently duplicating drug pairs.
assert len(df) == len(pivot_df), "feature tables contain duplicate drug keys"

# Replace Boolean values with 0/1 so the features can be subtracted.
df = df.replace({False: 0, True: 1})

# Difference the features: <feature>_diff = <feature> (drug 1) - <feature>_1 (drug 2).
# All columns are subtracted in one operation; the drug-2 columns are renamed
# to the drug-1 names first so pandas pairs them up column by column.
f_list = [name for name in fe_drug1.columns if name != 'Drug1']
paired = [name + '_1' for name in f_list]
diffs = (
    df[f_list] - df[paired].rename(columns=dict(zip(paired, f_list)))
).add_suffix('_diff')

# Swap the raw feature columns for their differences (appended at the end,
# in f_list order).
df = pd.concat([df.drop(columns=f_list + paired), diffs], axis=1)

# Downcasts df in place; as the last expression it also displays the frame.
df_optimized(df)

optimized size by 67.0 % | 0.126374743 GB


Unnamed: 0,DD_ID,Drug1,Drug2,0,1,2,3,4,5,6,...,SRW09_diff,SRW10_diff,TSRW10_diff,MW_diff,AMW_diff,WPath_diff,WPol_diff,Zagreb1_diff,Zagreb2_diff,mZagreb2_diff
0,100113,100,113,0,1,1,1,1,0,0,...,-7.2218,-1.1304,-74.0895,-1186.603394,-3.7356,-197200007879,-41,-158,-175,-4.1250
1,100123,100,123,1,1,1,1,1,0,0,...,0.0000,2.2652,33.0284,270.221802,-5.4149,2102,38,114,133,5.3333
2,100144,100,144,0,1,1,0,1,0,0,...,0.0000,-0.6562,-3.8462,-24.952101,-0.7735,-468,-8,-26,-41,-0.1944
3,100211,100,211,0,1,0,1,1,0,0,...,-7.1952,-0.0887,-1.6073,173.085907,-0.0307,1732,11,40,33,2.9722
4,100213,100,213,1,1,1,1,1,1,1,...,-7.0501,-0.1302,-4.7457,125.101196,-0.8477,1459,6,30,23,2.2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63468,744737,744,737,0,0,1,0,1,0,0,...,0.0000,6.5265,21.9259,-15.935800,-36.7537,15,0,18,15,0.0000
63469,744739,744,739,1,1,1,1,1,1,0,...,0.0000,7.6251,27.4189,22.020300,-17.7757,-99999984,0,20,16,1.0000
63470,744740,744,740,0,1,0,0,1,0,0,...,0.0000,7.6251,27.4189,57.924000,12.8524,-99999984,0,20,16,1.0000
63471,744742,744,742,1,1,1,0,1,0,0,...,0.0000,7.6251,27.4189,37.994202,-9.7887,-99999984,0,20,16,1.0000


In [26]:
# Summary statistics of the final table, one column per variable.
df.describe()

Unnamed: 0,DD_ID,Drug1,Drug2,0,1,2,3,4,5,6,...,SRW09_diff,SRW10_diff,TSRW10_diff,MW_diff,AMW_diff,WPath_diff,WPol_diff,Zagreb1_diff,Zagreb2_diff,mZagreb2_diff
count,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,...,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0,63473.0
mean,421776.326107,421.345533,430.793361,0.284625,0.927512,0.66663,0.49462,0.866841,0.196383,0.10803,...,-0.349763,-0.02092,-2.980403,-25.297638,0.214177,-1479215000.0,-3.123879,-9.73551,-11.583035,-0.381327
std,189338.54369,189.337676,183.09482,0.451239,0.259295,0.471421,0.499975,0.339749,0.397264,0.310421,...,4.893456,1.883626,27.475811,287.122223,4.922274,16712280000.0,37.798473,103.0309,125.255364,4.133774
min,100113.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-8.9635,-11.6882,-153.663803,-1583.936768,-50.198002,-378200000000.0,-229.0,-626.0,-768.0,-24.486099
25%,258290.0,258.0,270.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-5.4596,-0.6533,-17.6084,-135.068405,-1.1265,-1702.0,-20.0,-56.0,-71.0,-2.1111
50%,426724.0,426.0,443.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,-0.0717,-2.7308,-20.044399,0.0827,-282.0,-3.0,-8.0,-10.0,-0.3194
75%,596587.0,596.0,589.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.2691,0.4728,11.7396,96.234802,1.3842,864.0,15.0,38.0,49.0,1.5
max,744743.0,744.0,744.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,8.9635,11.6882,152.663803,1475.780762,49.793201,144000000000.0,201.0,550.0,662.0,21.625


In [None]:
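# Uncomment to save the final table to disk. Note that to_csv also writes the
# index as an unnamed first column unless index=False is passed; that column
# shows up as 'Unnamed: 0' when the file is read back with read_csv.
# df.to_csv("/Users/george/Desktop/LW-DDI Project/final_df.csv")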