# Preprocessing
___

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import codecs
from SmilesPE.tokenizer import *
from collections import Counter
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
import os
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


______
## 1. Drug to Drug interactions dataset (DDI)

### a. Interactions
Create the dataset 'interactions' with confirmed drug to drug interactions

In [4]:
#Interaction involving alimentary tract and metabolism drugs
alimentary = pd.read_csv('../data/drug interactions/ddinter_downloads_code_A.csv')
#Interaction involving blood and blood forming organs drugs
blood = pd.read_csv('../data/drug interactions/ddinter_downloads_code_B.csv')
#Interaction involving dermatological drugs
derma = pd.read_csv('../data/drug interactions/ddinter_downloads_code_D.csv')
#Interaction involving systemic hormonal preparations, excluding sex hormones and insulins drugs
hormonal = pd.read_csv('../data/drug interactions/ddinter_downloads_code_H.csv')
#Interaction involving antineoplastic and immunomodulating agents drugs
immuno = pd.read_csv('../data/drug interactions/ddinter_downloads_code_L.csv')
#Interaction involving antiparasitic products, insecticides and repellents drugs
para = pd.read_csv('../data/drug interactions/ddinter_downloads_code_P.csv')
#Interaction involving respiratory system drugs
respiratory = pd.read_csv('../data/drug interactions/ddinter_downloads_code_R.csv')
#Interaction involving various drugs
various = pd.read_csv('../data/drug interactions/ddinter_downloads_code_V.csv')


**i. DDI dataset (interactions)**<br>
Create a binarized target representing known and unknown interactions between drug_a and drug_b, categorised by the type of DDI involved.

In [5]:
alimentary.shape, blood.shape, derma.shape, hormonal.shape, immuno.shape, para.shape, respiratory.shape, various.shape

((56367, 5),
 (15140, 5),
 (25681, 5),
 (11727, 5),
 (65389, 5),
 (5492, 5),
 (30563, 5),
 (12024, 5))

In [6]:
alimentary.head()

Unnamed: 0,DDInterID_A,Drug_A,DDInterID_B,Drug_B,Level
0,DDInter1263,Naltrexone,DDInter1,Abacavir,Moderate
1,DDInter1,Abacavir,DDInter1348,Orlistat,Moderate
2,DDInter58,Aluminum hydroxide,DDInter582,Dolutegravir,Major
3,DDInter112,Aprepitant,DDInter582,Dolutegravir,Minor
4,DDInter138,Attapulgite,DDInter582,Dolutegravir,Major


In [7]:
#labeling each type of interaction
types = {
    'alimentary': 'alimentary tract and metabolism',
    'blood': 'blood and blood forming organs',
    'derma': 'dermatological',
    'hormonal': 'systemic hormonal preparations',
    'immuno': 'antineoplastic and immunomodulating agents',
    'para': 'antiparasitic products, insecticides and repellents',
    'respiratory': 'respiratory',
    'various': 'various'
}

#renaming the 'type' column in each interaction type df
for key, value in types.items():
    exec(f'{key}["type"] = "{value}"')

#concatenating interaction types into 1 df that will represent all DDIs
interactions = pd.concat([alimentary, blood, derma, hormonal, immuno, para, respiratory, various]).reset_index(drop = True)

In [8]:
interactions.head()

Unnamed: 0,DDInterID_A,Drug_A,DDInterID_B,Drug_B,Level,type
0,DDInter1263,Naltrexone,DDInter1,Abacavir,Moderate,alimentary tract and metabolism
1,DDInter1,Abacavir,DDInter1348,Orlistat,Moderate,alimentary tract and metabolism
2,DDInter58,Aluminum hydroxide,DDInter582,Dolutegravir,Major,alimentary tract and metabolism
3,DDInter112,Aprepitant,DDInter582,Dolutegravir,Minor,alimentary tract and metabolism
4,DDInter138,Attapulgite,DDInter582,Dolutegravir,Major,alimentary tract and metabolism


In [9]:
#no null values
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222383 entries, 0 to 222382
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DDInterID_A  222383 non-null  object
 1   Drug_A       222383 non-null  object
 2   DDInterID_B  222383 non-null  object
 3   Drug_B       222383 non-null  object
 4   Level        222383 non-null  object
 5   type         222383 non-null  object
dtypes: object(6)
memory usage: 10.2+ MB


In [10]:
#removing unnecessary columns (IDs)
interactions = interactions.drop(columns = ['DDInterID_A', 'DDInterID_B'])
interactions.columns = interactions.columns.str.lower()

#binarizing 'level' to represent 1 for an interaction, 0 for no interaction
interactions['level'] = interactions['level'].map({'Unknown':0, 'Minor': 1, 'Moderate': 1, 'Major': 1})
interactions = interactions.rename(columns = {'level': 'interaction'})
interactions['interaction'].value_counts(normalize = True)

1    0.787835
0    0.212165
Name: interaction, dtype: float64

In [11]:
interactions.head()

Unnamed: 0,drug_a,drug_b,interaction,type
0,Naltrexone,Abacavir,1,alimentary tract and metabolism
1,Abacavir,Orlistat,1,alimentary tract and metabolism
2,Aluminum hydroxide,Dolutegravir,1,alimentary tract and metabolism
3,Aprepitant,Dolutegravir,1,alimentary tract and metabolism
4,Attapulgite,Dolutegravir,1,alimentary tract and metabolism


### b. Known drugs
Create the dataset 'known_drugs' that represents those that have a known positive interaction (1) and explore different routes of administration.

**i. Find unique positive interactions**<br> <br>**Findings:** 1902 unique drugs that are involved in the dataset.

In [12]:
#only show unique known positive intereactions
known_drugs = interactions[interactions['interaction'] == 1].reset_index(drop = True).drop(columns = ['interaction'])
known_drugs = known_drugs.drop_duplicates(subset= ['drug_a', 'drug_b'])

known_drugs.shape

(130422, 3)

In [13]:
#drug names will be in lower case
for x in known_drugs.columns:
    known_drugs[x] = known_drugs[x].str.lower()

In [21]:
#create list of unique drugs with positive interactions
def node_list(df):
    node_list = []
    for col in df.columns:
        for node in df[col]:
            node_list.append(node)
            
    #removing duplicates
    node_list = list(set(node_list))
    return node_list

In [15]:
all_drugs = node_list(known_drugs[['drug_a', 'drug_b']])
all_drugs = pd.DataFrame(all_drugs, columns = ['drug_name'])

In [16]:
all_drugs.head()

Unnamed: 0,drug_name
0,indacaterol
1,cefiderocol
2,romidepsin
3,irbesartan
4,sitagliptin


In [17]:
#1902 unique drugs in our interaction dataset
all_drugs.shape

(1902, 1)

**ii. Exploring routes of administration**<br>
Some of the names have annotation describing the route of administration i.e. physostigmine and physostigmine (ophthalmic) meaning the same drug but applied to the eye. Does the route of administration effect the interactions of the drug? Should they be removed?<br>

**Findings:** If I get rid of the annotated versions i wont lose much information as they either share most of the same interactions as the regular version of the drug or don't contain many DDIs.

In [18]:
#example of the annnotation
all_drugs[all_drugs['drug_name'].str.contains('physostigmine')]

Unnamed: 0,drug_name
637,physostigmine
828,physostigmine (ophthalmic)


In [19]:
#complete list of the drugs
ad = all_drugs.copy()

#complete list of the drugs
#flagging drugs with the () notation
ad['flag'] = ad['drug_name'].str.find('(')

#drugs that aren't flagged with -1 are True
ad = ad[ad['flag'] != -1]

In [20]:
ad.head()

Unnamed: 0,drug_name,flag
17,dapsone (topical),8
29,bromfenac (ophthalmic),10
30,cinchocaine (topical),12
33,insulin human (isophane),14
35,tretinoin (topical),10


In [30]:
#seperating nomenclature with '()'
ad['drug_name'] = ad['drug_name'].str.split('(')
ad['annotation'] = ad['drug_name'].apply(lambda x: x[1][:-1])
ad['drug_name'] = ad['drug_name'].apply(lambda x: x[0])

In [31]:
#list of the special annotations
ad['annotation'].value_counts()[:10]

topical                     84
ophthalmic                  51
nasal                       19
liposomal                    6
liposome                     3
human                        3
formaldehyde inactivated     2
otic                         2
recombinant                  2
sodium                       2
Name: annotation, dtype: int64

In [None]:
#known DDIs with ophthalmic version of Physostigmine
ophthalmic = known_drugs[known_drugs['drug_b'] == 'physostigmine (ophthalmic)']
#known DDis with regular Physostigmine
regular = known_drugs[known_drugs['drug_b'] == 'physostigmine']

In [34]:
#ophthalmic doesn't seem to have as many DDIs as the regular version of Physostigmine
ophthalmic.shape, regular.shape

((49, 3), (81, 3))

In [35]:
#the ophthalmic version only has 1 different drug interaction to the regular
#this means it doesn't have much of an impact to remove the ophthalmic version
set(ophthalmic['drug_a']) - set(regular['drug_a'])

{'ipratropium'}

In [36]:
#theres only 1 instance of Benzalkonium (topical)
known_drugs[known_drugs['drug_b'].str.contains('benzalkonium')]

Unnamed: 0,drug_a,drug_b,type
70228,hyaluronic acid,benzalkonium (topical),dermatological


In [37]:
#there are only 2 instances of Dinoprostone (topical)
known_drugs[known_drugs['drug_a'].str.contains('dinoprostone')]

Unnamed: 0,drug_a,drug_b,type
19301,dinoprostone (topical),misoprostol,alimentary tract and metabolism
46825,dinoprostone (topical),urea,blood and blood forming organs


____
## 2. Latent factors dataset - Chemical Structures

Create a 'chem_sim' dataset that represents the chemical structures that are associated to each drug with a known positive reaction. Followed by tokenisation of these chemical structures.

We will be referencing collaborative filtering methodology which relies on the assumption that similar drugs will have similar interactions with other drugs. These 'representations' of the drugs will be another aspect of their profiles. This requires latent features that are 'representations' of the same information given by the DDIs, and their respective drugs. These will be used as indications of the relationships between drugs. Examples of these derive from the profiles of these drugs that would lead to a DDI i.e. side effects, metabolic pathways, targets, and chemical structures. I've chosen chemical structures to represent the latent space, and will create a similarity score between every drug. 

#### Feature engineering Drug Similarities using Chemical Structures


The chemical structure notation are written in following the Simplified Molecular Input Line Entry System (SMILES). Some rules to account for:<br>
- All elements of the periodic table can be represented with their letters, further clarification can be made by putting the element within square brackets []
- Bonds are represented as -,=,#,*,.
- Entire branches are denoted within parenthesis ()
- Carbons are annotated with with a number to indicate its position i.e. C1, C2...
- Charged atoms are denoted with {}
- Tetrahedral centers indicated by @



### a. Chemical Structures

Connect chemical structures to their associated drugs found in known_drugs.

**i. Cleaning Structures dataset**

In [22]:
#chemical structures of drugs
structures = pd.read_csv('../data/drug similarities/structure_links.csv')

In [25]:
#lowercase columns
structures.columns = structures.columns.str.lower().str.replace(' ', '_')
structures.head(2)

Unnamed: 0,name,smiles,inchikey,inchi,formula
0,Bivalirudin,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,C98H138N24O33
1,Leuprolide,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,C59H84N16O12


In [24]:
structures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11912 entries, 0 to 11911
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      11912 non-null  object
 1   smiles    11297 non-null  object
 2   inchikey  11300 non-null  object
 3   inchi     11300 non-null  object
 4   formula   11302 non-null  object
dtypes: object(5)
memory usage: 465.4+ KB


In [26]:
#relevant columns
structures= structures[['name', 'smiles']]

#removing null values
structures = structures.dropna(axis = 0)
structures.isna().sum()

name      0
smiles    0
dtype: int64

**ii. Merging chemical structures to the drugs found in all_drugs**<br>
Resulting from the merge, the amount of unique drugs were reduced from 1910 to 1404. This was translated into the original known_drugs DDI dataset.

In [28]:
#preparing for merge chemical structures to the relevant drugs in all_drugs
structures = structures.rename(columns = {'name': 'drug_name'})

In [29]:
#drug_names will be lower case
all_drugs['drug_name'] = all_drugs['drug_name'].str.lower()
structures['drug_name'] = structures['drug_name'].str.lower()      
all_drugs.shape

(1902, 1)

In [30]:
chem_sim = pd.merge(all_drugs, structures, on= 'drug_name', how='inner')

#the resulting merge reduced individual drugs from 1902 to 1413
chem_sim.shape

(1413, 2)

In [31]:
c = node_list(chem_sim[['drug_name']])
len(c)

1413

**iii. Address mismatch in unique drugs of known_drugs vs chem_sim**

In [32]:
#new list of drugs that we focus on
all_drugs_modified = chem_sim[['drug_name']]

#list of all of the drugs where we don't have chemical structure data on
dropped = list(set(all_drugs['drug_name']) - set(all_drugs_modified['drug_name']))
len(dropped)

489

In [34]:
dropped[:5]

['trastuzumab',
 'abaloparatide',
 'dextran (low molecular weight)',
 'mumps virus strain b level jeryl lynn live antigen',
 'ephedrine (nasal)']

In [35]:
#dropping all interactions that have drugs that don't appear in new drug list
known_drugs.shape

(130422, 3)

In [53]:
all_drugs_modified = list(chem_sim['drug_name'])

#modified dataset of known DDIs excluding the drugs that were dropped
known_drugs_modified = known_drugs.copy()
known_drugs_modified = known_drugs_modified[known_drugs_modified['drug_a'].isin(all_drugs_modified)]
known_drugs_modified = known_drugs_modified[known_drugs_modified['drug_b'].isin(all_drugs_modified)]

In [55]:
#still have 89008 DDIs
known_drugs_modified.shape

(89008, 3)

**iv. address the further reduction from exception interactions**<br>
Some interactions that were dropped held unique drugs that only had 1 interaction, resulting in the loss of that unique drug. 

In [56]:
len(all_drugs_modified)

1413

In [57]:
#the filtering has further reduced the number of unique drugs. why?
after_culling = node_list(known_drugs_modified[['drug_a', 'drug_b']])
len(after_culling)

1404

In [58]:
all_drugs = node_list(known_drugs_modified[['drug_a', 'drug_b']])
set(c) - set(all_drugs)

{'cromoglicic acid',
 'etidocaine',
 'hyaluronic acid',
 'mepivacaine',
 'migalastat',
 'miglustat',
 'nedocromil',
 'pyrophosphoric acid',
 'zanamivir'}

In [59]:
#reason for reduced unique drugs is that the drugs in the above list are solely involved with a drug that was removed
#this deems them unimportant
known_drugs[known_drugs['drug_b'] == 'cromoglicic acid']

Unnamed: 0,drug_a,drug_b,type
17634,"insulin human (inhalation, rapid acting)",cromoglicic acid,alimentary tract and metabolism


In [60]:
#DDIs with 1404 unique drugs
known_drugs= known_drugs_modified
all_drugs = pd.DataFrame(all_drugs, columns = ['drug_name'])

**v. one more time**

In [61]:
#further aligning of chem_sim drug portfolio to the modified known_drugs drug portfolio
chem_sim = chem_sim[chem_sim['drug_name'].isin(list(all_drugs['drug_name']))]

### b. Tokenisation of SMILES

**Seperating the chemical structure of the drugs whilst retaining relevant characteristics in notation**<br>
The SMILES notation needs to be tokenised appropriately in to account for substructures expressed in the notation. There are already many different tokenisers available online for this task. Instead of the basic versions of SMILES tokenisers, I've decided to use one that has been pre-trained from ChEMBL data (another database that uses the same SMILES notation) from SmilesPE.

**i. Applying pre-trained SMILES tokenizer: SmilesPE**

In [37]:
#opening vocab list
spe_vocab = codecs.open('../resources/SPE_ChEMBL.txt')
#initialising tokenizer
spe = SPE_Tokenizer(spe_vocab)

In [38]:
c = chem_sim.copy()
#applying tokenization to smiles and appending to tokens list
tokens = []
for x in c['smiles']:
    tokens.append(spe.tokenize(x))

In [39]:
#adding tokens to new column, splitting spaces so that each entry is a list of tokens
chem_sim['tokens'] = tokens
chem_sim['tokens'] = chem_sim['tokens'].map(lambda x: x.split(' '))

In [40]:
#the tokenization has been successful in keeping characteristics of each element in the chemical structure
chem_sim.head()

Unnamed: 0,drug_name,smiles,tokens
0,indacaterol,CCC1=C(CC)C=C2CC(CC2=C1)NC[C@H](O)C1=C2C=CC(=O...,"[CCC1, =C(, CC), C=C2, CC(, CC2, =, C1), NC, [..."
1,cefiderocol,[H][C@]12SCC(C[N+]3(CCNC(=O)C4=C(Cl)C(O)=C(O)C...,"[[H], [C@]12, S, CC(, C, [N+], 3, (, CCNC(=O),..."
2,romidepsin,C\C=C1/NC(=O)[C@H]2CSSCC\C=C\[C@H](CC(=O)N[C@H...,"[C, \, C=C1, /, NC(=O), [C@H]2, CS, S, CC, \, ..."
3,irbesartan,CCCCC1=NC2(CCCC2)C(=O)N1CC1=CC=C(C=C1)C1=CC=CC...,"[CCCCC1, =N, C2(, CCCC2), C(=O)N1, CC1=, CC, =..."
4,sitagliptin,N[C@@H](CC(=O)N1CCN2C(C1)=NN=C2C(F)(F)F)CC1=CC...,"[N[C@@H](, CC(=O)N1, CCN2, C(, C1), =N, N=C2, ..."


In [41]:
#saving this version of chem_sim for eda later on
chem_sim.to_csv('../data/cleaned data/chem_sim_eda.csv', index = False)

In [42]:
chem_sim = chem_sim.drop(columns = ['smiles'])

**ii. Label encoding tokens. (Pre-processing for modeling)**<br>

In [43]:
labelencoder = LabelEncoder()

In [44]:
#exploding tokens column to then label them into numerical data
encode = chem_sim[['tokens']]
encode = encode.explode('tokens')
#label encoding
encode['labels'] = labelencoder.fit_transform(encode['tokens'])
encode = encode.reset_index()
#groupby the index and aggregate to a list. output are the label encoded features in 1 list
encode = encode.groupby('index').agg(list)
chem_sim['labels'] = encode['labels']

In [45]:
chem_sim.head()

Unnamed: 0,drug_name,tokens,labels
0,indacaterol,"[CCC1, =C(, CC), C=C2, CC(, CC2, =, C1), NC, [...","[407, 94, 364, 317, 327, 377, 92, 253, 645, 85..."
1,cefiderocol,"[[H], [C@]12, S, CC(, C, [N+], 3, (, CCNC(=O),...","[920, 891, 745, 327, 134, 926, 77, 31, 488, 30..."
2,romidepsin,"[C, \, C=C1, /, NC(=O), [C@H]2, CS, S, CC, \, ...","[134, 960, 316, 61, 649, 873, 561, 745, 324, 9..."
3,irbesartan,"[CCCCC1, =N, C2(, CCCC2), C(=O)N1, CC1=, CC, =...","[431, 113, 282, 425, 157, 371, 324, 94, 311, 2..."
4,sitagliptin,"[N[C@@H](, CC(=O)N1, CCN2, C(, C1), =N, N=C2, ...","[670, 331, 485, 139, 253, 113, 644, 219, 371, ..."


In [71]:
chem_sim.to_csv('../data/cleaned data/chem_sim_vectors.csv', index = False)

### c. Creating nodes representing positive known DDIs

Each drug will be assigned a unique numerical ID. They will then represent nodes in a graph.

In [46]:
known_list = node_list(known_drugs)
len(known_list)

1910

In [74]:
#saving version of known_drugs with the names and types before converting into numerical nodes
known_drugs.to_csv('../data/cleaned data/known_drugs_names.csv', index = False)

In [47]:
#applying numerical IDs for each drug that represent nodes
known_drugs= known_drugs[['drug_a', 'drug_b']]
nodes = known_drugs
nodes = nodes.stack().rank(method='dense').astype(int).unstack()

#making nodes start from 0
nodes  = nodes.apply(lambda x: x-1)
nodes = nodes.rename(columns = {'drug_a': 'druga_id', 'drug_b': 'drugb_id'})
known_drugs = pd.concat([known_drugs, nodes], axis = 1)
known_drugs = known_drugs.sort_values(by = 'druga_id', ascending = True).reset_index(drop = True)
known_drugs.head()

Unnamed: 0,drug_a,drug_b,druga_id,drugb_id
0,abacavir,peginterferon beta-1a,0,1356
1,abacavir,orlistat,0,1296
2,abacavir,trabectedin,0,1775
3,abacavir,teriflunomide,0,1708
4,abacavir,leflunomide,0,988


**i. Drug_id lookup**

A list of all drugs (nodes) with their associated ID.

In [49]:
#list of all unique drugs in each row of drug_a and drug_b
druga_lookup = known_drugs[['druga_id', 'drug_a']].drop_duplicates(subset = 'drug_a').reset_index(drop = True)
drugb_lookup = known_drugs[['drugb_id', 'drug_b']].drop_duplicates(subset = 'drug_b').reset_index(drop = True)

In [50]:
#drug b has more individual IDs 
druga_lookup.shape, drugb_lookup.shape

((1735, 2), (1812, 2))

In [51]:
druga_lookup.head()

Unnamed: 0,druga_id,drug_a
0,0,abacavir
1,2,abametapir (topical)
2,3,abarelix
3,4,abatacept
4,5,abciximab


In [52]:
#cross checking that the assigned nodes are the same for both columns
drugb_lookup = drugb_lookup.sort_values(by = 'drugb_id', ascending = True)
drugb_lookup.head()

Unnamed: 0,drugb_id,drug_b
1411,0,abacavir
1554,1,abaloparatide
390,2,abametapir (topical)
1565,3,abarelix
900,4,abatacept


In [57]:
#merging the drug_a and drug_b lookups since drug_b has other unique drugs
drugb_lookup = drugb_lookup.rename(columns = {'drugb_id': 'druga_id'})
all_drugs_lookup = pd.merge(drugb_lookup, druga_lookup, on = 'druga_id', how = 'outer')
all_drugs_lookup['drug_b'] = all_drugs_lookup['drug_b'].fillna(all_drugs_lookup['drug_a'])
all_drugs_lookup = all_drugs_lookup.drop(columns = ['drug_a'])
all_drugs_lookup = all_drugs_lookup.rename(columns = {'drug_b': 
                                                     'drug_name'})
#complete list of the drugs and their associated nodes
all_drugs_lookup = all_drugs_lookup.sort_values(by = 'druga_id', ascending = True).reset_index(drop = True)
all_drugs_lookup.head()

Unnamed: 0,druga_id,drug_name
0,0,abacavir
1,1,abaloparatide
2,2,abametapir (topical)
3,3,abarelix
4,4,abatacept


**ii. DDIs represented as nodes**

In [90]:
#we want just 2 columns representing 2 nodes
known_drugs = known_drugs.drop(columns = ['drug_a', 'drug_b'])
known_drugs.head()

Unnamed: 0,druga_id,drugb_id
0,0,174
1,0,945
2,0,295
3,0,1305
4,0,1257


In [91]:
all_drugs_lookup.to_csv('../data/cleaned data/all_drugs_lookup.csv', index = False)
interactions.to_csv('../data/cleaned data/interactions.csv', index = False)
known_drugs.to_csv('../data/cleaned data/known_drugs.csv', index = False)
chem_sim.to_csv('../data/cleaned data/chem_sim.csv', index = False)