In [223]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import codecs
from SmilesPE.tokenizer import *
from collections import Counter
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
import os

### 1. Drug to Drug interactions (DDI)
Curating a data set with confirmed drug to drug interactions

In [339]:
#Interaction involving alimentary tract and metabolism drugs
alimentary = pd.read_csv('../data/drug interactions/ddinter_downloads_code_A.csv')
#Interaction involving blood and blood forming organs drugs
blood = pd.read_csv('../data/drug interactions/ddinter_downloads_code_B.csv')
#Interaction involving dermatological drugs
derma = pd.read_csv('../data/drug interactions/ddinter_downloads_code_D.csv')
#Interaction involving systemic hormonal preparations, excluding sex hormones and insulins drugs
hormonal = pd.read_csv('../data/drug interactions/ddinter_downloads_code_H.csv')
#Interaction involving antineoplastic and immunomodulating agents drugs
immuno = pd.read_csv('../data/drug interactions/ddinter_downloads_code_L.csv')
#Interaction involving antiparasitic products, insecticides and repellents drugs
para = pd.read_csv('../data/drug interactions/ddinter_downloads_code_P.csv')
#Interaction involving respiratory system drugs
respiratory = pd.read_csv('../data/drug interactions/ddinter_downloads_code_R.csv')
#Interaction involving various drugs
various = pd.read_csv('../data/drug interactions/ddinter_downloads_code_V.csv')


#chemical structures of drugs
structures = pd.read_csv('../data/drug similarities/structure_links.csv')

**i. DDI dataset (interactions)**<br>
We want a binarized target representing known and unknown interactions, drug_a, drug_b and the type of DDI involved.

In [225]:
alimentary.shape, blood.shape, derma.shape, hormonal.shape, immuno.shape, para.shape, respiratory.shape, various.shape

((56367, 5),
 (15140, 5),
 (25681, 5),
 (11727, 5),
 (65389, 5),
 (5492, 5),
 (30563, 5),
 (12024, 5))

In [226]:
alimentary.head()

Unnamed: 0,DDInterID_A,Drug_A,DDInterID_B,Drug_B,Level
0,DDInter1263,Naltrexone,DDInter1,Abacavir,Moderate
1,DDInter1,Abacavir,DDInter1348,Orlistat,Moderate
2,DDInter58,Aluminum hydroxide,DDInter582,Dolutegravir,Major
3,DDInter112,Aprepitant,DDInter582,Dolutegravir,Minor
4,DDInter138,Attapulgite,DDInter582,Dolutegravir,Major


In [227]:
#labeling each type of interaction
alimentary['type'] = 'alimentary tract and metabolism'
blood['type'] = 'blood and blood forming organs'
derma['type'] = 'dermatological'
hormonal['type'] = 'systemic hormonal preparations'
immuno['type'] = 'antineoplastic and immunomodulating agents'
para['type'] = 'antiparasitic products, insecticides and repellents'
respiratory['type'] = 'respiratory'
various['type'] = 'various'

In [228]:
#concatenating all drug types into 1 df that will represent all DDIs
interactions = pd.concat([alimentary, blood, derma, hormonal, immuno, para, respiratory, various]).reset_index(drop = True)

In [229]:
interactions.head()

Unnamed: 0,DDInterID_A,Drug_A,DDInterID_B,Drug_B,Level,type
0,DDInter1263,Naltrexone,DDInter1,Abacavir,Moderate,alimentary tract and metabolism
1,DDInter1,Abacavir,DDInter1348,Orlistat,Moderate,alimentary tract and metabolism
2,DDInter58,Aluminum hydroxide,DDInter582,Dolutegravir,Major,alimentary tract and metabolism
3,DDInter112,Aprepitant,DDInter582,Dolutegravir,Minor,alimentary tract and metabolism
4,DDInter138,Attapulgite,DDInter582,Dolutegravir,Major,alimentary tract and metabolism


In [230]:
#no null values
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222383 entries, 0 to 222382
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DDInterID_A  222383 non-null  object
 1   Drug_A       222383 non-null  object
 2   DDInterID_B  222383 non-null  object
 3   Drug_B       222383 non-null  object
 4   Level        222383 non-null  object
 5   type         222383 non-null  object
dtypes: object(6)
memory usage: 10.2+ MB


In [231]:
#removing unnecessary IDs
interactions = interactions.drop(columns = ['DDInterID_A', 'DDInterID_B'])

In [232]:
interactions.columns = interactions.columns.str.lower()

In [233]:
#binarizing 'level' to represent 1 for an interaction, 0 for no interaction
interactions['level'] = interactions['level'].map({'Unknown':0, 'Minor': 1, 'Moderate': 1, 'Major': 1})

In [234]:
interactions = interactions.rename(columns = {'level': 'interaction'})

In [235]:
interactions['interaction'].value_counts(normalize = True)

1    0.787835
0    0.212165
Name: interaction, dtype: float64

In [236]:
interactions.head()

Unnamed: 0,drug_a,drug_b,interaction,type
0,Naltrexone,Abacavir,1,alimentary tract and metabolism
1,Abacavir,Orlistat,1,alimentary tract and metabolism
2,Aluminum hydroxide,Dolutegravir,1,alimentary tract and metabolism
3,Aprepitant,Dolutegravir,1,alimentary tract and metabolism
4,Attapulgite,Dolutegravir,1,alimentary tract and metabolism


### 2. Known drugs dataset

Only interested with drugs that have interaction (1). Found 1902 unique drugs that are involved in the dataset.

In [237]:
known_drugs = interactions[interactions['interaction'] == 1].reset_index(drop = True).drop(columns = ['interaction'])

In [238]:
known_drugs.shape

(175201, 3)

In [239]:
known_drugs = known_drugs.drop_duplicates(subset= ['drug_a', 'drug_b'])

In [240]:
known_drugs.shape

(130422, 3)

In [241]:
#drug names will be in lower case
for x in known_drugs.columns:
    known_drugs[x] = known_drugs[x].str.lower()

#### Number of unique drugs in known_drugs

In [242]:
def node_list(df):
    node_list = []
    for col in df.columns:
        for node in df[col]:
            node_list.append(node)
            
    #removing duplicates
    node_list = list(set(node_list))
    return node_list

In [243]:
#creating list of all drugs found in DDIs
all_drugs = node_list(known_drugs[['drug_a', 'drug_b']])

In [244]:
all_drugs = pd.DataFrame(all_drugs, columns = ['drug_name'])

In [245]:
all_drugs.head()

Unnamed: 0,drug_name
0,fentanyl
1,interferon beta-1a
2,indomethacin
3,bexarotene
4,trimetrexate


In [246]:
#1902 unique drugs in our interaction dataset
all_drugs.shape

(1902, 1)

**ii. Exploring naming convention of drugs from known_drugs**<br>
Some of the names have annotation describing the route of administration i.e. physostigmine and physostigmine (ophthalmic) meaning the same drug but applied to the eye. Does the route of administration affect the interactions of the drug? Should they be removed?<br>

**Findings:** If I get rid of the annotated versions i wont lose much information as they either share most of the same interactions as the regular version of the drug or don't contain many DDIs.

In [247]:
#example of the annnotation
all_drugs[all_drugs['drug_name'].str.contains('physostigmine')]

Unnamed: 0,drug_name
499,physostigmine (ophthalmic)
1636,physostigmine


In [248]:
#complete list of the drugs
ad = all_drugs.copy()

In [249]:
#complete list of the drugs
#flagging drugs with the () notation
ad['flag'] = ad['drug_name'].str.find('(')

In [250]:
#drugs that aren't flagged with -1 are True
ad = ad[ad['flag'] != -1]

In [251]:
ad.head()

Unnamed: 0,drug_name,flag
7,minocycline (topical),12
10,scopolamine (ophthalmic),12
30,trifarotene (topical),12
57,human immunoglobulin g (intravenous),23
80,iobenguane (i-131),11


In [252]:
#seperating nomenclature with '()'
ad['drug_name'] = ad['drug_name'].str.split('(')
ad['annotation'] = ad['drug_name'].apply(lambda x: x[1][:-1])
ad['drug_name'] = ad['drug_name'].apply(lambda x: x[0])

In [253]:
#list of the special annotations
ad['annotation'].value_counts()[:10]

topical        84
ophthalmic     51
nasal          19
liposomal       6
liposome        3
human           3
i-131           2
recombinant     2
live            2
otic            2
Name: annotation, dtype: int64

In [254]:
#known DDIs with ophthalmic version of Physostigmine
ophthalmic = known_drugs[known_drugs['drug_b'] == 'physostigmine (ophthalmic)']

In [255]:
#known DDis with regular Physostigmine
regular = known_drugs[known_drugs['drug_b'] == 'physostigmine']

In [256]:
#ophthalmic doesn't seem to have as many DDIs as the regular version of Physostigmine
ophthalmic.shape, regular.shape

((49, 3), (81, 3))

In [257]:
#the ophthalmic version only has 1 different drug interaction to the regular
#this means it doesn't have much of an impact to remove the ophthalmic version
set(ophthalmic['drug_a']) - set(regular['drug_a'])

{'ipratropium'}

In [258]:
#theres only 1 instance of Benzalkonium (topical)
known_drugs[known_drugs['drug_b'].str.contains('benzalkonium')]

Unnamed: 0,drug_a,drug_b,type
70228,hyaluronic acid,benzalkonium (topical),dermatological


In [259]:
#there are only 2 instances of Dinoprostone (topical)
known_drugs[known_drugs['drug_a'].str.contains('dinoprostone')]

Unnamed: 0,drug_a,drug_b,type
19301,dinoprostone (topical),misoprostol,alimentary tract and metabolism
46825,dinoprostone (topical),urea,blood and blood forming organs


## Latent factors
We will be using collaborative filtering methodology which relies on the assumption that similar drugs will have similar interactions. These 'representations' of the drugs will be another aspect of their profiles. This requires latent features that are 'representations' of the same information given by the DDIs, and their respective drugs. These will be used as indications of the relationships between drugs. Examples of these derive from the profiles of these drugs that would lead to a DDI i.e. side effects, metabolic pathways, targets, and chemical structures. I've chosen chemical structures to represent the latent space, and will create a similarity score between every drug. 

### 1. Feature engineering Drug Similarities using Chemical Structures


The chemical structure notation are written in following the Simplified Molecular Input Line Entry System (SMILES). Some rules to account for:<br>
- All elements of the periodic table can be represented with their letters, further clarification can be made by putting the element within square brackets []
- Bonds are represented as -,=,#,*,.
- Entire branches are denoted within parenthesis ()
- Carbons are annotated with with a number to indicate its position i.e. C1, C2...
- Charged atoms are denoted with {}
- Tetrahedral centers indicated by @



**

**i. Cleaning Structures dataset**

In [334]:
structures.columns = structures.columns.str.lower().str.replace(' ', '_')

In [335]:
structures.head(2)

Unnamed: 0,drugbank_id,name,cas_number,drug_groups,inchikey,inchi,smiles,formula,kegg_compound_id,kegg_drug_id,pubchem_compound_id,pubchem_substance_id,chebi_id,chembl_id,het_id,chemspider_id,bindingdb_id
0,DB00006,Bivalirudin,128270-60-0,approved; investigational,OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,C98H138N24O33,,D03136,16129704.0,46507415.0,59173.0,CHEMBL2103749,,10482069.0,50248103.0
1,DB00007,Leuprolide,53714-56-0,approved; investigational,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,C59H84N16O12,C07612,D08113,,46507635.0,6427.0,CHEMBL1201199,,571356.0,50369395.0


In [336]:
structures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11912 entries, 0 to 11911
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   drugbank_id           11912 non-null  object 
 1   name                  11912 non-null  object 
 2   cas_number            8005 non-null   object 
 3   drug_groups           11912 non-null  object 
 4   inchikey              11300 non-null  object 
 5   inchi                 11300 non-null  object 
 6   smiles                11297 non-null  object 
 7   formula               11302 non-null  object 
 8   kegg_compound_id      2324 non-null   object 
 9   kegg_drug_id          2046 non-null   object 
 10  pubchem_compound_id   8720 non-null   float64
 11  pubchem_substance_id  9194 non-null   float64
 12  chebi_id              5224 non-null   float64
 13  chembl_id             7854 non-null   object 
 14  het_id                6024 non-null   object 
 15  chemspider_id      

In [337]:
#relevant columns
structures= structures[['name', 'smiles']]
#removing null values
structures = structures.dropna(axis = 0)

In [264]:
structures.isna().sum()

name      0
smiles    0
dtype: int64

**ii. Merging chemical structures to the drugs found in all_drugs**<br>
Resulting from the merge, the amount of unique drugs were reduced from 1910 to 1404. This was translated into the original known_drugs DDI dataset.

In [265]:
#preparing for merge chemical structures to the relevant drugs in all_drugs
structures = structures.rename(columns = {'name': 'drug_name'})

In [266]:
#drug_names will be lower case
all_drugs['drug_name'] = all_drugs['drug_name'].str.lower()
structures['drug_name'] = structures['drug_name'].str.lower()      

In [267]:
all_drugs.shape

(1902, 1)

In [268]:
chem_sim = pd.merge(all_drugs, structures, on= 'drug_name', how='inner')

In [269]:
#the resulting merge reduced individual drugs from 1902 to 1413
chem_sim.shape

(1413, 2)

In [270]:
c = node_list(chem_sim[['drug_name']])
len(c)

1413

**iii. Address mismatch in unique drugs of known_drugs vs chem_sim**

In [271]:
#new list of drugs that we focus on
all_drugs_modified = chem_sim[['drug_name']]

In [272]:
#list of all of the drugs where we don't have chemical structure data on
dropped = list(set(all_drugs['drug_name']) - set(all_drugs_modified['drug_name']))
len(dropped)

489

In [273]:
dropped[:5]

['nivolumab',
 'resorcinol (topical)',
 'interferon beta-1a',
 'desoximetasone (topical)',
 'doxepin (topical)']

In [274]:
#dropping all interactions that have drugs that don't appear in new drug list
known_drugs.shape

(130422, 3)

In [275]:
all_drugs_modified = list(chem_sim['drug_name'])

In [276]:
#modified dataset of known DDIs excluding the drugs that were dropped
known_drugs_modified = known_drugs.copy()
known_drugs_modified = known_drugs_modified[known_drugs_modified['drug_a'].isin(all_drugs_modified)]
known_drugs_modified = known_drugs_modified[known_drugs_modified['drug_b'].isin(all_drugs_modified)]

In [277]:
#still have 89008 DDIs
known_drugs_modified.shape

(89008, 3)

**iv. address the further reduction from exception interactions**<br>
Some interactions that were dropped held unique drugs that only had 1 interaction, resulting in the loss of that unique drug. 

In [278]:
len(all_drugs_modified)

1413

In [279]:
#the filtering has further reduced the number of unique drugs. why?
after_culling = node_list(known_drugs_modified[['drug_a', 'drug_b']])
len(after_culling)

1404

In [280]:
all_drugs = node_list(known_drugs_modified[['drug_a', 'drug_b']])
set(c) - set(all_drugs)

{'cromoglicic acid',
 'etidocaine',
 'hyaluronic acid',
 'mepivacaine',
 'migalastat',
 'miglustat',
 'nedocromil',
 'pyrophosphoric acid',
 'zanamivir'}

In [281]:
#reason for reduced unique drugs is that the drugs in the above list are solely involved with a drug that was removed
#this deems them unimportant
known_drugs[known_drugs['drug_b'] == 'cromoglicic acid']

Unnamed: 0,drug_a,drug_b,type
17634,"insulin human (inhalation, rapid acting)",cromoglicic acid,alimentary tract and metabolism


In [282]:
#DDIs with 1404 unique drugs
known_drugs= known_drugs_modified
all_drugs = pd.DataFrame(all_drugs, columns = ['drug_name'])

**v. one more time**

In [283]:
#further aligning of chem_sim drug portfolio to the modified known_drugs drug portfolio
chem_sim = chem_sim[chem_sim['drug_name'].isin(list(all_drugs['drug_name']))]

### 2. Tokenisation of SMILES

**Seperating the chemical structure of the drugs whilst retaining relevant characteristics in notation**<br>
The SMILES notation needs to be tokenised appropriately in to account for characteristics expressed in the notation. There are already many different tokenisers available online for this task. Instead of the basic versions of SMILES tokenisers, I've decided to use one that has been pre-trained from ChEMBL data (another database that uses the same SMILES notation) from SmilesPE.

**i. Applying pre-trained SMILES tokenizer: SmilesPE**

In [284]:
#opening vocab list
spe_vocab = codecs.open('../resources/SPE_ChEMBL.txt')
#initialising tokenizer
spe = SPE_Tokenizer(spe_vocab)

In [285]:
c = chem_sim.copy()
#applying tokenization to smiles and appending to tokens list
tokens = []
for x in c['smiles']:
    tokens.append(spe.tokenize(x))

In [286]:
#adding tokens to new column, splitting spaces so that each entry is a list of tokens
chem_sim['tokens'] = tokens
chem_sim['tokens'] = chem_sim['tokens'].map(lambda x: x.split(' '))

In [287]:
#the tokenization has been successful in keeping characteristics of each element in the chemical structure
chem_sim.head()

Unnamed: 0,drug_name,smiles,tokens
0,fentanyl,CCC(=O)N(C1CCN(CCC2=CC=CC=C2)CC1)C1=CC=CC=C1,"[CC, C(=O)N(, C1CCN(, CCC2, =, CC=, CC, =C, 2)..."
1,indomethacin,COC1=CC2=C(C=C1)N(C(=O)C1=CC=C(Cl)C=C1)C(C)=C2...,"[CO, C1=, CC2, =C(, C=, C1), N(C(=O), C1=, CC,..."
2,bexarotene,CC1=CC2=C(C=C1C(=C)C1=CC=C(C=C1)C(O)=O)C(C)(C)...,"[CC1=, CC2, =C(, C=C1, C(=C), C1=, CC, =C(, C=..."
3,trimetrexate,COC1=CC(NCC2=C(C)C3=C(C=C2)N=C(N)N=C3N)=CC(OC)...,"[CO, C1=, CC(N, CC2, =C(C), C3=C(, C=C, 2), N=..."
4,chloramphenicol,OC[C@@H](NC(=O)C(Cl)Cl)[C@H](O)C1=CC=C(C=C1)[N...,"[OC, [C@@H](NC(=O), C(Cl), Cl), [C@H](O), C1=,..."


In [288]:
#saving this version of chem_sim for eda later on
chem_sim.to_csv('../data/cleaned data/chem_sim_eda.csv', index = False)

In [289]:
chem_sim = chem_sim.drop(columns = ['smiles'])

In [None]:
encode = chem_sim[['tokens']]
encode = encode.explode('tokens')

**ii. Aligning DDIs with their latent factors**

In [290]:
#DDIs with their associated chem structures
drug_a = known_drugs[['drug_a']].rename(columns = {'drug_a': 'drug_name'})
drug_b = known_drugs[['drug_b']].rename(columns = {'drug_b': 'drug_name'})
drug_a_chem = pd.merge(drug_a, chem_sim, on = 'drug_name', how = 'left')
drug_b_chem = pd.merge(drug_b, chem_sim, on = 'drug_name', how = 'left')

In [291]:
known_drugs.shape, drug_a_chem.shape, drug_b_chem.shape

((89008, 3), (89008, 2), (89008, 2))

In [292]:
#chem_sim dataset reflecting the DDIs with chemical structures to reference
chem_sim = pd.merge(drug_a_chem, drug_b_chem, left_index = True, right_index = True)

In [293]:
known_drugs.head()

Unnamed: 0,drug_a,drug_b,type
0,naltrexone,abacavir,alimentary tract and metabolism
1,abacavir,orlistat,alimentary tract and metabolism
2,aluminum hydroxide,dolutegravir,alimentary tract and metabolism
3,aprepitant,dolutegravir,alimentary tract and metabolism
4,attapulgite,dolutegravir,alimentary tract and metabolism


In [294]:
chem_sim.head()

Unnamed: 0,drug_name_x,tokens_x,drug_name_y,tokens_y
0,naltrexone,"[[H], [C@@]12, OC3, =C(O), C=, CC4, =C3, [C@@]...",abacavir,"[NC1=N, C2=C(, N=, CN2, [C@@H]2, C[C@H](, CO),..."
1,abacavir,"[NC1=N, C2=C(, N=, CN2, [C@@H]2, C[C@H](, CO),...",orlistat,"[CCCCCCCC, CCC[C@@H](, C[C@@H]1, OC(=O), [C@H]..."
2,aluminum hydroxide,"[[OH-], ., [OH-], ., [OH-], ., [Al+3]]",dolutegravir,"[[H], [C@]12, CN3, C=C(, C(=O)N, CC4, =, CC, =..."
3,aprepitant,"[C[C@@H](, O[C@H]1, OCCN(, CC2, =N, NC(=O)N, 2...",dolutegravir,"[[H], [C@]12, CN3, C=C(, C(=O)N, CC4, =, CC, =..."
4,attapulgite,"[[O--], ., [O--], ., [O--], ., [O--], ., [O--]...",dolutegravir,"[[H], [C@]12, CN3, C=C(, C(=O)N, CC4, =, CC, =..."


### Cosine similarity
Pure cosine similarity based item-item CF: Based on users’ interaction with product universe we calculated similarity(distance between pairs of item vectors) scores for items pars and finally cosine similarity matrix. Then matrix dot product between similarity matrix(12620 X 12620) and user-item matrix(5352 X 12620)generates item scores(a list 12620 scores for each user)across all users and picking up top 20 product for each user solves the problem.

In [295]:
chem = chem_sim[['tokens_x', 'tokens_y']]

In [296]:
chem.iloc[0]

tokens_x    [[H], [C@@]12, OC3, =C(O), C=, CC4, =C3, [C@@]...
tokens_y    [NC1=N, C2=C(, N=, CN2, [C@@H]2, C[C@H](, CO),...
Name: 0, dtype: object

In [297]:
#Counter objects disect the elements of the chem structure and provides a count
Counter(chem.iloc[0][0])

Counter({'[H]': 2,
         '[C@@]12': 1,
         'OC3': 1,
         '=C(O)': 1,
         'C=': 1,
         'CC4': 1,
         '=C3': 1,
         '[C@@]1': 1,
         '1': 1,
         'CCN(': 1,
         'CC3CC3)': 1,
         '[C@](': 1,
         ')': 1,
         '(C': 1,
         '4)': 1,
         '[C@]1(O)': 1,
         'CCC2': 1,
         '=O': 1})

In [298]:
Counter(chem.iloc[0][1])

Counter({'NC1=N': 1,
         'C2=C(': 1,
         'N=': 1,
         'CN2': 1,
         '[C@@H]2': 1,
         'C[C@H](': 1,
         'CO)': 1,
         'C=C': 1,
         '2)': 1,
         'C(N': 1,
         'C2CC2)': 1,
         '=N1': 1})

In [299]:
# code from Martjin Pieters
#calculates the cosine similarities between the Counter objects 
def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

In [300]:
cos_sim = []
for i in range(0, chem.shape[0]):
    counter_a = Counter(chem.iloc[i][0])
    counter_b = Counter(chem.iloc[i][1])
    cos_sim.append(counter_cosine_similarity(counter_a, counter_b)) 

In [301]:
len(cos_sim)

89008

In [302]:
#assigning cosine similarity scores to new column
chem_sim['similarity_score'] = cos_sim

In [303]:
chem_sim.head()

Unnamed: 0,drug_name_x,tokens_x,drug_name_y,tokens_y,similarity_score
0,naltrexone,"[[H], [C@@]12, OC3, =C(O), C=, CC4, =C3, [C@@]...",abacavir,"[NC1=N, C2=C(, N=, CN2, [C@@H]2, C[C@H](, CO),...",0.0
1,abacavir,"[NC1=N, C2=C(, N=, CN2, [C@@H]2, C[C@H](, CO),...",orlistat,"[CCCCCCCC, CCC[C@@H](, C[C@@H]1, OC(=O), [C@H]...",0.0
2,aluminum hydroxide,"[[OH-], ., [OH-], ., [OH-], ., [Al+3]]",dolutegravir,"[[H], [C@]12, CN3, C=C(, C(=O)N, CC4, =, CC, =...",0.0
3,aprepitant,"[C[C@@H](, O[C@H]1, OCCN(, CC2, =N, NC(=O)N, 2...",dolutegravir,"[[H], [C@]12, CN3, C=C(, C(=O)N, CC4, =, CC, =...",0.23355
4,attapulgite,"[[O--], ., [O--], ., [O--], ., [O--], ., [O--]...",dolutegravir,"[[H], [C@]12, CN3, C=C(, C(=O)N, CC4, =, CC, =...",0.077311


In [343]:
explode = chem_sim[['tokens_x']]
explode.explode('tokens_x')

Unnamed: 0,tokens_x
0,[H]
0,[C@@]12
0,OC3
0,=C(O)
0,C=
...,...
89007,C1CCCCC1)
89007,C1=
89007,CC=
89007,CC


In [304]:
chem_sim['tokens_x'][:1]

0    [[H], [C@@]12, OC3, =C(O), C=, CC4, =C3, [C@@]...
Name: tokens_x, dtype: object

In [305]:
chem_sim[chem_sim['similarity_score']==0].shape

(23161, 5)

### Creating nodes representing DDI

In [306]:
known_list = node_list(known_drugs)

In [307]:
len(known_list)

1412

In [308]:
#saving version of known_drugs with the names and types before converting into numerical nodes
known_drugs.to_csv('../data/cleaned data/known_drugs_names.csv', index = False)

In [309]:
#applying numerical IDs for each drug that represent nodes
known_drugs= known_drugs[['drug_a', 'drug_b']]
nodes = known_drugs
nodes = nodes.stack().rank(method='dense').astype(int).unstack()

In [310]:
#making nodes start from 0
nodes  = nodes.apply(lambda x: x-1)
nodes = nodes.rename(columns = {'drug_a': 'druga_id', 'drug_b': 'drugb_id'})

In [311]:
known_drugs = pd.concat([known_drugs, nodes], axis = 1)

In [312]:
known_drugs = known_drugs.sort_values(by = 'druga_id', ascending = True).reset_index(drop = True)

In [313]:
known_drugs.head()

Unnamed: 0,drug_a,drug_b,druga_id,drugb_id
0,abacavir,cabozantinib,0,174
1,abacavir,orlistat,0,945
2,abacavir,cobicistat,0,295
3,abacavir,trabectedin,0,1305
4,abacavir,teriflunomide,0,1257


### Drug_id lookup

In [314]:
#list of all unique drugs in each row of drug_a and drug_b
druga_lookup = known_drugs[['druga_id', 'drug_a']].drop_duplicates(subset = 'drug_a').reset_index(drop = True)
drugb_lookup = known_drugs[['drugb_id', 'drug_b']].drop_duplicates(subset = 'drug_b').reset_index(drop = True)

In [315]:
#drug b has more individual IDs 
druga_lookup.shape, drugb_lookup.shape

((1307, 2), (1361, 2))

In [316]:
druga_lookup.head()

Unnamed: 0,druga_id,drug_a
0,0,abacavir
1,1,abarelix
2,2,abemaciclib
3,3,abiraterone
4,4,acalabrutinib


In [317]:
#cross checking that the assigned nodes are the same for both columns
drugb_lookup = drugb_lookup.sort_values(by = 'drugb_id', ascending = True)
drugb_lookup.head()

Unnamed: 0,drugb_id,drug_b
1202,0,abacavir
1360,1,abarelix
825,2,abemaciclib
604,3,abiraterone
219,4,acalabrutinib


In [318]:
#merging the drug_a and drug_b lookups since drug_b has other unique drugs
drugb_lookup = drugb_lookup.rename(columns = {'drugb_id': 'druga_id'})

In [319]:
all_drugs_lookup = pd.merge(drugb_lookup, druga_lookup, on = 'druga_id', how = 'outer')

In [320]:
all_drugs_lookup['drug_b'] = all_drugs_lookup['drug_b'].fillna(all_drugs_lookup['drug_a'])

In [321]:
all_drugs_lookup = all_drugs_lookup.drop(columns = ['drug_a'])
all_drugs_lookup = all_drugs_lookup.rename(columns = {'drug_b': 
                                                     'drug_name'})

In [322]:
#complete list of the drugs and their associated nodes
all_drugs_lookup = all_drugs_lookup.sort_values(by = 'druga_id', ascending = True).reset_index(drop = True)

In [323]:
all_drugs_lookup.head()

Unnamed: 0,druga_id,drug_name
0,0,abacavir
1,1,abarelix
2,2,abemaciclib
3,3,abiraterone
4,4,acalabrutinib


In [324]:
#we want just 2 columns representing 2 nodes
known_drugs = known_drugs.drop(columns = ['drug_a', 'drug_b'])
known_drugs.head()

Unnamed: 0,druga_id,drugb_id
0,0,174
1,0,945
2,0,295
3,0,1305
4,0,1257


In [325]:
# n = chem_sim.shape[0]*chem_sim.shape[0]
# drug_a = chem_sim.sample(n=n, replace=True, random_state=0).reset_index(drop=True)
# drug_b = chem_sim.sample(n=n, replace=True, random_state=42).reset_index(drop=True)
# combos = pd.merge(drug_a,drug_b, left_index=True, right_index=True)

In [326]:
all_drugs_lookup.to_csv('../data/cleaned data/all_drugs_lookup.csv', index = False)
interactions.to_csv('../data/cleaned data/interactions.csv', index = False)
known_drugs.to_csv('../data/cleaned data/known_drugs.csv', index = False)
chem_sim.to_csv('../data/cleaned data/chem_sim.csv', index = False)

### Toxicity

In [327]:
toxicity = pd.read_csv('../data/toxicity/Boxed_warnings_for_drugs_CompToxSI.csv')

In [328]:
toxicity.head()

Unnamed: 0,parent_chemblid,salt_chemblid,salt_prefname,fda_synonyms,note,note_id,toxicity_class,mislabelled,extracted_on,fda_boxed_warning,fda_application_number,fda_set_id,fda_effective_time,fda_substance_name,fda_boxedwarning_example
0,CHEMBL1000,CHEMBL1000,CETIRIZINE,,Not found in OpenFDA,-1,,,20190331,,,,,,0
1,CHEMBL1000,CHEMBL1607273,CETIRIZINE HYDROCHLORIDE,Cetirizine HCl|Cetirizine,No Boxed Warning for the salt in any medicinal...,0,,,20190331,,,,,,0
2,CHEMBL100116,CHEMBL100116,PENTAZOCINE,,Boxed warning for a combination medicinal prod...,2,misuse,not_mislabelled,20190331,"WARNING: ADDICTION, ABUSE, AND MISUSE; RISK EV...",ANDA075523,41ebdaaf-3bbc-419f-b996-0341efc14623,20180731.0,NALOXONE HYDROCHLORIDE|PENTAZOCINE,1
3,CHEMBL100116,CHEMBL100116,PENTAZOCINE,,Boxed warning for a combination medicinal prod...,2,respiratory_toxicity,not_mislabelled,20190331,"WARNING: ADDICTION, ABUSE, AND MISUSE; RISK EV...",ANDA075523,41ebdaaf-3bbc-419f-b996-0341efc14623,20180731.0,NALOXONE HYDROCHLORIDE|PENTAZOCINE,1
4,CHEMBL100116,CHEMBL3989510,PENTAZOCINE HYDROCHLORIDE,Pentazocine HCl,Boxed warning for a combination medicinal prod...,2,misuse,not_mislabelled,20190331,"WARNING: ADDICTION, ABUSE, AND MISUSE; RISK EV...",ANDA075735,017ef042-40aa-44c9-baa8-037f06356845,20190222.0,NALOXONE HYDROCHLORIDE|PENTAZOCINE HYDROCHLORIDE,1


In [329]:
toxicity['toxicity_class'].value_counts()

cardiotoxicity              1682
teratogenicity              1602
neurotoxicity               1516
misuse                      1431
psychiatric_toxicity        1430
gastrotoxicity              1300
hepatotoxicity              1247
respiratory_toxicity        1226
vascular_toxicity            575
carcinogenicity              475
metabolism_toxicity          434
hematological_toxicity       363
musculoskeletal_toxicity     324
immune_system_toxicity       256
infections                   206
dermatological_toxicity      166
nephrotoxicity               122
Name: toxicity_class, dtype: int64

In [330]:
toxicity.shape

(18687, 15)

In [331]:
toxicity['mislabelled'].value_counts()

not_mislabelled    15478
mislabelled          175
Name: mislabelled, dtype: int64

In [332]:
toxicity['toxicity_class'].isnull().sum()

4332