In [1]:
from platform import python_version

print(python_version())

3.8.10


In [7]:
import torch

#### Torch and cuda version

In [8]:
torch.__version__

'1.13.0+cu117'

In [9]:
torch.cuda.is_available()

True

In [10]:
print(torch.version.cuda)

11.7


In [11]:
# Check the device

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [12]:
from rdkit import Chem

In [13]:
import deepchem as dc

2023-07-14 14:49:31.514694: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading some Jax models, missing a dependency. No module named 'jax'


### Installation of packages

In [14]:
# Import packages and libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

### Load Data 

In [15]:
# load all the dataset
covid_drugs_data = pd.read_csv('covid_bioactivity_data_without_intermediate_class_2.csv')
covid_drugs_data

Unnamed: 0,molecule_chembl_id,canonical_smiles,assay_description,assay_type,target_organism,target_pref_name,bioactivity_class,standard_value,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL204499,NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,0,240.219,-2.4278,4.0,7.0
1,CHEMBL203308,NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)nn1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,0,240.219,-2.4278,4.0,7.0
2,CHEMBL381539,NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)cn1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,0,239.231,-1.8228,4.0,6.0
3,CHEMBL225045,Nc1nc(O)c2[nH]cc([C@@H]3C=C(CO)[C@@H](O)[C@H]3...,Antiviral activity against SARS coronavirus,F,SARS coronavirus,SARS coronavirus,inactive,100000,0,278.268,-1.0166,6.0,7.0
4,CHEMBL224363,O=c1[nH]cc([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c(=O...,Antiviral activity against SARS coronavirus,F,SARS coronavirus,SARS coronavirus,inactive,100000,0,240.215,-2.1991,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",CC(C)CC1C(=O)NC(C(=O)NC(CC2=CC(=NC=C2)C3=NC(CS...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,715.878,-0.9674,7.0,10.0
736,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",CC(C)C[C@H]1C(=O)N[C@H](C(=O)N[C@@H](CC2=CC(=N...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,715.878,-0.9674,7.0,10.0
737,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",CC1C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(CC2...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,829.982,-2.5941,9.0,12.0
738,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",C[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,829.982,-2.5941,9.0,12.0


### Check info and description of the extracted data

In [16]:
covid_drugs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 13 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   molecule_chembl_id                                               740 non-null    object 
 1   canonical_smiles                                                 740 non-null    object 
 2   assay_description                                                740 non-null    object 
 3   assay_type                                                       740 non-null    object 
 4   target_organism                                                  740 non-null    object 
 5   target_pref_name                                                 740 non-null    object 
 6   bioactivity_class                                                740 non-null    object 
 7   standard_value                              

In [17]:
covid_drugs_data.describe()

Unnamed: 0,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
count,740.0,740.0,740.0,740.0,740.0
mean,0.528378,410.874608,2.668118,2.659459,5.932432
std,0.499532,153.475589,2.305945,2.710223,3.255187
min,0.0,126.111,-8.35173,0.0,1.0
25%,0.0,302.458,1.504525,1.0,4.0
50%,1.0,379.3455,3.0143,2.0,5.0
75%,1.0,482.81475,4.1374,4.0,7.0
max,1.0,1468.68,7.6211,23.0,22.0


In [18]:
comp1_unique = covid_drugs_data['canonical_smiles'].unique()

In [19]:
comp1_unique

array(['NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1',
       'NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)nn1',
       'NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)cn1',
       'Nc1nc(O)c2[nH]cc([C@@H]3C=C(CO)[C@@H](O)[C@H]3O)c2n1',
       'O=c1[nH]cc([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c(=O)[nH]1',
       'Nc1nc(=O)c([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c[nH]1',
       'OC[C@H]1O[C@@H](n2cnc3c(Cl)ncnc32)[C@H](O)[C@@H]1O',
       'NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c1O',
       'NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)n1',
       'OCC1C(COCc2ccccc2)CC1n1cnc2c(Cl)ncnc21',
       'OCC1CC(n2cnc3c(Cl)ncnc32)C1CO',
       'OCC(CCn1cnc2c(Cl)ncnc21)COCc1ccccc1',
       'OCC(CO)CCn1cnc2c(Cl)ncnc21',
       'OC[C@@H]1C[C@@H](O)[C@H](n2cnc3c(Cl)ncnc32)O1',
       'OC[C@H]1O[C@@H](n2cnc3c(Cl)ncnc32)C[C@@H]1O',
       'Nc1nc(Cl)c2ncn([C@@H]3O[C@H](CO)[C@@H](O)[C@H]3O)c2n1',
       'CSc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O',
       'COc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O',
   

In [20]:
len(comp1_unique)

608

In [21]:
comp2_unique = covid_drugs_data['target_organism'].unique()

In [22]:
comp2_unique

array(['SARS coronavirus',
       'Middle East respiratory syndrome-related coronavirus',
       'Severe acute respiratory syndrome coronavirus 2',
       'ORF1ab - ORF1a polyprotein;ORF1ab polyprotein (Betacoronavirus England 1)',
       ' surface glycoprotein (Severe acute respiratory syndrome coronavirus 2)',
       'Replicase polyprotein 1ab (Severe acute respiratory syndrome coronavirus 2)'],
      dtype=object)

In [23]:
len(comp2_unique)

6

In [25]:
covid_drugs_data.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,assay_description,assay_type,target_organism,target_pref_name,bioactivity_class,standard_value,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL204499,NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,0,240.219,-2.4278,4.0,7.0
1,CHEMBL203308,NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)nn1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,0,240.219,-2.4278,4.0,7.0
2,CHEMBL381539,NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)cn1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,0,239.231,-1.8228,4.0,6.0
3,CHEMBL225045,Nc1nc(O)c2[nH]cc([C@@H]3C=C(CO)[C@@H](O)[C@H]3...,Antiviral activity against SARS coronavirus,F,SARS coronavirus,SARS coronavirus,inactive,100000,0,278.268,-1.0166,6.0,7.0
4,CHEMBL224363,O=c1[nH]cc([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c(=O...,Antiviral activity against SARS coronavirus,F,SARS coronavirus,SARS coronavirus,inactive,100000,0,240.215,-2.1991,5.0,5.0


In [26]:
covid_drugs_data.tail()

Unnamed: 0,molecule_chembl_id,canonical_smiles,assay_description,assay_type,target_organism,target_pref_name,bioactivity_class,standard_value,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
735,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",CC(C)CC1C(=O)NC(C(=O)NC(CC2=CC(=NC=C2)C3=NC(CS...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,715.878,-0.9674,7.0,10.0
736,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",CC(C)C[C@H]1C(=O)N[C@H](C(=O)N[C@@H](CC2=CC(=N...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,715.878,-0.9674,7.0,10.0
737,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",CC1C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(CC2...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,829.982,-2.5941,9.0,12.0
738,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",C[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,829.982,-2.5941,9.0,12.0
739,164628593 -- 2-(1-methylindol-3-yl)sulfanyl-N...,CN1C=C(C2=CC=CC=C21)SCC(=O)NCC3=CC=C(C=C3)C(F)...,Inhibition of SARS-CoV-2 RdRp transfected in h...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,378.419,4.6056,1.0,3.0


In [27]:
comp1_index_list = [i+1 for i in range(len(comp1_unique))]
comp1_index_list

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [28]:
len(comp1_index_list)

608

In [29]:
starts = len(comp1_unique)
ends = len(comp2_unique) + starts
comp2_index_list = [k+1 for k in range(starts, ends)]
comp2_index_list

[609, 610, 611, 612, 613, 614]

In [30]:
len(comp2_index_list)

6

In [31]:
covid_drugs_data['canonical_smiles'].where(covid_drugs_data['canonical_smiles'] == comp1_unique[0], 1)

0      NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1
1                                                  1
2                                                  1
3                                                  1
4                                                  1
                           ...                      
735                                                1
736                                                1
737                                                1
738                                                1
739                                                1
Name: canonical_smiles, Length: 740, dtype: object

In [32]:
covid_drugs_data['target_organism'].where(covid_drugs_data['target_organism'] == comp2_unique[0], 2)

0      SARS coronavirus
1      SARS coronavirus
2      SARS coronavirus
3      SARS coronavirus
4      SARS coronavirus
             ...       
735                   2
736                   2
737                   2
738                   2
739                   2
Name: target_organism, Length: 740, dtype: object

In [33]:
for index, comp_name in enumerate(comp1_unique):
    covid_drugs_data.loc[(covid_drugs_data['canonical_smiles'] == comp_name), 'canonical_smiles'] = comp1_index_list[index]

In [34]:
for index, comp_name in enumerate(comp2_unique):
    covid_drugs_data.loc[(covid_drugs_data['target_organism'] == comp_name), 'target_organism'] = comp2_index_list[index]

In [36]:
covid_drugs_data.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,assay_description,assay_type,target_organism,target_pref_name,bioactivity_class,standard_value,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL204499,1,Cytotoxicity against SARS coronavirus,A,609,SARS coronavirus,inactive,100000,0,240.219,-2.4278,4.0,7.0
1,CHEMBL203308,2,Cytotoxicity against SARS coronavirus,A,609,SARS coronavirus,inactive,100000,0,240.219,-2.4278,4.0,7.0
2,CHEMBL381539,3,Cytotoxicity against SARS coronavirus,A,609,SARS coronavirus,inactive,100000,0,239.231,-1.8228,4.0,6.0
3,CHEMBL225045,4,Antiviral activity against SARS coronavirus,F,609,SARS coronavirus,inactive,100000,0,278.268,-1.0166,6.0,7.0
4,CHEMBL224363,5,Antiviral activity against SARS coronavirus,F,609,SARS coronavirus,inactive,100000,0,240.215,-2.1991,5.0,5.0


In [37]:
covid_drugs_data.tail()

Unnamed: 0,molecule_chembl_id,canonical_smiles,assay_description,assay_type,target_organism,target_pref_name,bioactivity_class,standard_value,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
735,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",604,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,614,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,715.878,-0.9674,7.0,10.0
736,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",605,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,614,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,715.878,-0.9674,7.0,10.0
737,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",606,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,614,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,829.982,-2.5941,9.0,12.0
738,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",607,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,614,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,829.982,-2.5941,9.0,12.0
739,164628593 -- 2-(1-methylindol-3-yl)sulfanyl-N...,608,Inhibition of SARS-CoV-2 RdRp transfected in h...,B,614,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,0,378.419,4.6056,1.0,3.0


### Creating source nodes and destination nodes

In [38]:
# Source nodes
sourcenode = covid_drugs_data['canonical_smiles'].to_list()
sourcenode

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 24,
 23,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 51,
 52,
 52,
 53,
 53,
 54,
 54,
 55,
 55,
 56,
 56,
 57,
 57,
 58,
 58,
 59,
 59,
 60,
 60,
 61,
 61,
 62,
 62,
 63,
 63,
 64,
 64,
 65,
 65,
 66,
 66,
 67,
 67,
 68,
 68,
 69,
 69,
 70,
 70,
 71,
 71,
 72,
 72,
 73,
 73,
 74,
 74,
 75,
 75,
 76,
 76,
 77,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 105,
 107,
 110,
 105,
 107,
 108,
 110,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 14

In [39]:
# Target nodes
destinnode = covid_drugs_data['target_organism'].to_list()
destinnode

[609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610

In [40]:
len(sourcenode)

740

In [41]:
len(destinnode)

740

### Concatenate sourcenode and destinnode

In [42]:
source_sd = sourcenode + destinnode
source_sd

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 24,
 23,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 51,
 52,
 52,
 53,
 53,
 54,
 54,
 55,
 55,
 56,
 56,
 57,
 57,
 58,
 58,
 59,
 59,
 60,
 60,
 61,
 61,
 62,
 62,
 63,
 63,
 64,
 64,
 65,
 65,
 66,
 66,
 67,
 67,
 68,
 68,
 69,
 69,
 70,
 70,
 71,
 71,
 72,
 72,
 73,
 73,
 74,
 74,
 75,
 75,
 76,
 76,
 77,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 105,
 107,
 110,
 105,
 107,
 108,
 110,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 14

In [43]:
len(source_sd)

1480

In [44]:
source_ds = destinnode + sourcenode
source_ds

[609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 609,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610,
 610

In [45]:
len(source_ds)

1480

### Edge index and type

In [46]:
edge_index = torch.tensor([source_sd, source_ds], dtype=torch.long)

In [47]:
edge_index

tensor([[  1,   2,   3,  ..., 614, 614, 614],
        [609, 609, 609,  ..., 606, 607, 608]])

In [48]:
len(edge_index)

2

In [49]:
type(edge_index)

torch.Tensor

In [50]:
edge_index.shape

torch.Size([2, 1480])

In [51]:
relation_list = covid_drugs_data['bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)' ].to_list() 

In [52]:
relation_list

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [53]:
len(relation_list)

740

In [54]:
type(relation_list)

list

In [55]:
edge_type = torch.tensor(relation_list*2, dtype=torch.long)

In [56]:
len(edge_type)

1480

In [57]:
type(edge_type)

torch.Tensor

In [58]:
edge_type.shape

torch.Size([1480])

### Check the ratio number of targets 

In [59]:
covid_drugs_data.iloc[:, 8]

0      0
1      0
2      0
3      0
4      0
      ..
735    0
736    0
737    0
738    0
739    0
Name: bioactivity_class_labels (0-inactive, 1-active, 2-intermediate), Length: 740, dtype: int64

In [60]:
print("The number of zeros ('0') targets in nv_data is: ", sum(covid_drugs_data.iloc[:, 8] == 0))

The number of zeros ('0') targets in nv_data is:  349


In [61]:
print("The number of ones ('1') targets in nv_data is: ", sum(covid_drugs_data.iloc[:, 8] == 1))

The number of ones ('1') targets in nv_data is:  391


In [62]:
type(comp1_unique)

numpy.ndarray

## Dateset 2 for feature extraction

In [63]:
# load all the dataset
covid_drugs_data_updated = pd.read_csv('covid_bioactivity_data_without_intermediate_class_2_updated.csv')
covid_drugs_data_updated

Unnamed: 0,molecule_chembl_id,canonical_smiles,assay_description,assay_type,target_organism,target_pref_name,bioactivity_class,standard_value,target_organism_canonical_smiles,target_organism_isomeric_smiles,"bioactivity_class_labels (0-inactive, 1-active, 2-intermediate)",MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL204499,NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,240.219,-2.4278,4,7
1,CHEMBL203308,NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)nn1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,240.219,-2.4278,4,7
2,CHEMBL381539,NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)cn1,Cytotoxicity against SARS coronavirus,A,SARS coronavirus,SARS coronavirus,inactive,100000,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,239.231,-1.8228,4,6
3,CHEMBL225045,Nc1nc(O)c2[nH]cc([C@@H]3C=C(CO)[C@@H](O)[C@H]3...,Antiviral activity against SARS coronavirus,F,SARS coronavirus,SARS coronavirus,inactive,100000,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,278.268,-1.0166,6,7
4,CHEMBL224363,O=c1[nH]cc([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c(=O...,Antiviral activity against SARS coronavirus,F,SARS coronavirus,SARS coronavirus,inactive,100000,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,240.215,-2.1991,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",CC(C)CC1C(=O)NC(C(=O)NC(CC2=CC(=NC=C2)C3=NC(CS...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,715.878,-0.9674,7,10
736,"164621889 -- (5R,8S,11S,14S,17S,20S)-17-(3-am...",CC(C)C[C@H]1C(=O)N[C@H](C(=O)N[C@@H](CC2=CC(=N...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,715.878,-0.9674,7,10
737,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",CC1C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(CC2...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,829.982,-2.5941,9,12
738,"164624927 -- (5R,8S,11S,14S,17S,20S,23S,26S)-...",C[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=...,Binding affinity to SARS CoV-2 15N/2H-labeled ...,B,Replicase polyprotein 1ab (Severe acute respir...,ORF1ab - ORF1a polyprotein;ORF1ab polyprotein ...,inactive,,CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])N...,CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...,0,829.982,-2.5941,9,12


#### drug chemical compound

In [64]:
drug_smiles_vect_corpus = covid_drugs_data_updated['canonical_smiles'].to_list()
drug_smiles_vect_corpus

['NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1',
 'NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)nn1',
 'NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)cn1',
 'Nc1nc(O)c2[nH]cc([C@@H]3C=C(CO)[C@@H](O)[C@H]3O)c2n1',
 'O=c1[nH]cc([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c(=O)[nH]1',
 'Nc1nc(=O)c([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c[nH]1',
 'OC[C@H]1O[C@@H](n2cnc3c(Cl)ncnc32)[C@H](O)[C@@H]1O',
 'NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c1O',
 'NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)n1',
 'OCC1C(COCc2ccccc2)CC1n1cnc2c(Cl)ncnc21',
 'OCC1CC(n2cnc3c(Cl)ncnc32)C1CO',
 'OCC(CCn1cnc2c(Cl)ncnc21)COCc1ccccc1',
 'OCC(CO)CCn1cnc2c(Cl)ncnc21',
 'OC[C@@H]1C[C@@H](O)[C@H](n2cnc3c(Cl)ncnc32)O1',
 'OC[C@H]1O[C@@H](n2cnc3c(Cl)ncnc32)C[C@@H]1O',
 'Nc1nc(Cl)c2ncn([C@@H]3O[C@H](CO)[C@@H](O)[C@H]3O)c2n1',
 'CSc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O',
 'COc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O',
 'O[C@@H]1[C@@H](COCc2ccccc2)O[C@@H](n2cnc3c(Cl)ncnc32)[C@@H]1O',
 'CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C

In [65]:
type(drug_smiles_vect_corpus)

list

In [66]:
len(drug_smiles_vect_corpus)

740

#### target chemical compound

In [67]:
target_smiles_vect_corpus = covid_drugs_data_updated['target_organism_canonical_smiles'].to_list()
target_smiles_vect_corpus

['CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)CC(C(=O)NC(CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2

In [68]:
type(target_smiles_vect_corpus)

list

In [69]:
len(target_smiles_vect_corpus)

740

### "TF-IDF" with analyzer="char_wb" seems more reasonable and informative on chemical compound SMILES data.

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
vectorizer_tfidf = TfidfVectorizer(analyzer='char_wb', norm='l2')
vectorizer_tfidf

### vectorize drug smiles 

In [72]:
drug_smiles_vect_features = vectorizer_tfidf.fit_transform(drug_smiles_vect_corpus)
drug_smiles_vect_features

<740x36 sparse matrix of type '<class 'numpy.float64'>'
	with 8931 stored elements in Compressed Sparse Row format>

In [73]:
drug_smiles_vect_features.toarray()

array([[0.09729017, 0.        , 0.19563355, ..., 0.        , 0.        ,
        0.        ],
       [0.09729017, 0.        , 0.19563355, ..., 0.        , 0.        ,
        0.        ],
       [0.0963129 , 0.        , 0.19366844, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03922663, 0.        , 0.37467002, ..., 0.        , 0.03670491,
        0.        ],
       [0.03189533, 0.        , 0.30464571, ..., 0.        , 0.02984491,
        0.        ],
       [0.08083054, 0.        , 0.20317015, ..., 0.        , 0.07563429,
        0.        ]])

In [74]:
type(drug_smiles_vect_features)

scipy.sparse._csr.csr_matrix

In [75]:
(drug_smiles_vect_features.toarray()).shape

(740, 36)

### vectorize target smiles 

In [76]:
target_vectorizer_tfidf = TfidfVectorizer(analyzer='char_wb', norm='l2')
target_vectorizer_tfidf

In [77]:
target_smiles_vect_features = target_vectorizer_tfidf.fit_transform(target_smiles_vect_corpus)
target_smiles_vect_features

<740x14 sparse matrix of type '<class 'numpy.float64'>'
	with 10360 stored elements in Compressed Sparse Row format>

In [78]:
type(target_smiles_vect_features)

scipy.sparse._csr.csr_matrix

In [79]:
target_smiles_vect_features.toarray()

array([[0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       ...,
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977]])

In [80]:
(target_smiles_vect_features.toarray()).shape

(740, 14)

#### Feature of the vectorizers

In [81]:
vectorizer_tfidf.get_feature_names_out()

array([' ', '#', '(', ')', '+', '-', '.', '/', '1', '2', '3', '4', '5',
       '6', '7', '8', '=', '@', '[', '\\', ']', 'a', 'b', 'c', 'e', 'f',
       'g', 'h', 'i', 'l', 'n', 'o', 'p', 'r', 's', 'z'], dtype=object)

In [82]:
target_vectorizer_tfidf.get_feature_names_out()

array([' ', '(', ')', '-', '1', '2', '=', '[', ']', 'c', 'f', 'n', 'o',
       's'], dtype=object)

##### Sample check

In [83]:
check_target_smiles_vect_features = vectorizer_tfidf.fit_transform(target_smiles_vect_corpus)
check_target_smiles_vect_features

<740x14 sparse matrix of type '<class 'numpy.float64'>'
	with 10360 stored elements in Compressed Sparse Row format>

In [84]:
type(check_target_smiles_vect_features)

scipy.sparse._csr.csr_matrix

In [85]:
check_target_smiles_vect_features.toarray()

array([[0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       ...,
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977],
       [0.0725954, 0.362977 , 0.362977 , ..., 0.1088931, 0.2903816,
        0.0362977]])

In [86]:
(check_target_smiles_vect_features.toarray()).shape

(740, 14)

## Using isomeric smiles for targets

#### isomeric target chemical compound

In [87]:
isomeric_target_smiles_vect_corpus = covid_drugs_data_updated['target_organism_isomeric_smiles'].to_list()
isomeric_target_smiles_vect_corpus

['CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=O)[O-])NC(=O)OCC2CCC(CC2)(F)F',
 'CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O

In [88]:
type(isomeric_target_smiles_vect_corpus)

list

In [89]:
len(isomeric_target_smiles_vect_corpus)

740

In [90]:
#initialize vectorizer

isomeric_target_vectorizer_tfidf = TfidfVectorizer(analyzer='char_wb', norm='l2')
isomeric_target_vectorizer_tfidf

In [91]:
isomeric_target_smiles_vect_features = isomeric_target_vectorizer_tfidf.fit_transform(isomeric_target_smiles_vect_corpus)
isomeric_target_smiles_vect_features

<740x16 sparse matrix of type '<class 'numpy.float64'>'
	with 11840 stored elements in Compressed Sparse Row format>

In [92]:
type(isomeric_target_smiles_vect_features)

scipy.sparse._csr.csr_matrix

In [93]:
isomeric_target_smiles_vect_features.toarray()

array([[0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       ...,
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708]])

In [94]:
(isomeric_target_smiles_vect_features.toarray()).shape

(740, 16)

#### Feature of the vectorizers

In [95]:
vectorizer_tfidf.get_feature_names_out()

array([' ', '(', ')', '-', '1', '2', '=', '[', ']', 'c', 'f', 'n', 'o',
       's'], dtype=object)

In [96]:
target_vectorizer_tfidf.get_feature_names_out()

array([' ', '(', ')', '-', '1', '2', '=', '[', ']', 'c', 'f', 'n', 'o',
       's'], dtype=object)

In [97]:
isomeric_target_vectorizer_tfidf.get_feature_names_out()

array([' ', '(', ')', '-', '1', '2', '=', '@', '[', ']', 'c', 'f', 'h',
       'n', 'o', 's'], dtype=object)

### Nmpy array for isomeric targets

In [98]:
isomeric_targets_zeros = np.zeros((740,20))
isomeric_targets_zeros

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [99]:
isomeric_targets_zeros.shape

(740, 20)

In [100]:
type(isomeric_targets_zeros)

numpy.ndarray

### Final features for drug chemicals and targets

In [101]:
drug_smiles_vect_features = drug_smiles_vect_features.toarray()
drug_smiles_vect_features

array([[0.09729017, 0.        , 0.19563355, ..., 0.        , 0.        ,
        0.        ],
       [0.09729017, 0.        , 0.19563355, ..., 0.        , 0.        ,
        0.        ],
       [0.0963129 , 0.        , 0.19366844, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03922663, 0.        , 0.37467002, ..., 0.        , 0.03670491,
        0.        ],
       [0.03189533, 0.        , 0.30464571, ..., 0.        , 0.02984491,
        0.        ],
       [0.08083054, 0.        , 0.20317015, ..., 0.        , 0.07563429,
        0.        ]])

In [102]:
drug_smiles_vect_features.shape

(740, 36)

In [103]:
type(drug_smiles_vect_features)

numpy.ndarray

In [104]:
isomeric_target_smiles_vect_features = isomeric_target_smiles_vect_features.toarray()
isomeric_target_smiles_vect_features

array([[0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       ...,
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.10721125, 0.28589668,
        0.03573708]])

In [105]:
isomeric_target_smiles_vect_features.shape

(740, 16)

In [106]:
type(isomeric_target_smiles_vect_features)

numpy.ndarray

In [107]:
# Isomeric target features

final_target_features = np.concatenate((isomeric_target_smiles_vect_features, isomeric_targets_zeros), axis=1)
final_target_features

array([[0.07147417, 0.35737084, 0.35737084, ..., 0.        , 0.        ,
        0.        ],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.        , 0.        ,
        0.        ],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.07147417, 0.35737084, 0.35737084, ..., 0.        , 0.        ,
        0.        ],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.        , 0.        ,
        0.        ],
       [0.07147417, 0.35737084, 0.35737084, ..., 0.        , 0.        ,
        0.        ]])

In [108]:
final_target_features.shape

(740, 36)

In [109]:
type(final_target_features)

numpy.ndarray

### Final TF-idf features for drugs and targets

In [110]:
tfidf_final_features = np.concatenate((drug_smiles_vect_features, final_target_features), axis=1)
tfidf_final_features

array([[0.09729017, 0.        , 0.19563355, ..., 0.        , 0.        ,
        0.        ],
       [0.09729017, 0.        , 0.19563355, ..., 0.        , 0.        ,
        0.        ],
       [0.0963129 , 0.        , 0.19366844, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03922663, 0.        , 0.37467002, ..., 0.        , 0.        ,
        0.        ],
       [0.03189533, 0.        , 0.30464571, ..., 0.        , 0.        ,
        0.        ],
       [0.08083054, 0.        , 0.20317015, ..., 0.        , 0.        ,
        0.        ]])

In [111]:
tfidf_final_features.shape

(740, 72)

In [112]:
type(tfidf_final_features)

numpy.ndarray

### Standardize tfidf features

In [113]:
# Deploy the sklearn fuctionality to take its advantage of its preprocessing capabilities
from sklearn.preprocessing import StandardScaler

# Create the scaler object
scaler = StandardScaler()  
#scaler = preprocessing.MinMaxScaler()

In [114]:
tfidf_final_features_standardized = scaler.fit_transform(tfidf_final_features)
tfidf_final_features_standardized

array([[ 0.30459631, -0.23112801, -0.19399129, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.30459631, -0.23112801, -0.19399129, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.27651859, -0.23112801, -0.22103798, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.36362035, -0.23112801,  2.27016487, ...,  0.        ,
         0.        ,  0.        ],
       [-1.57425498, -0.23112801,  1.30639003, ...,  0.        ,
         0.        ,  0.        ],
       [-0.16830322, -0.23112801, -0.0902619 , ...,  0.        ,
         0.        ,  0.        ]])

In [115]:
tfidf_final_features_standardized.shape

(740, 72)

In [116]:
type(tfidf_final_features_standardized)

numpy.ndarray

## Mordred and RDKIT individual drug's and target's atoms and compounds features resp.

### Step 1: Atom Featurisation

We start by defining an auxiliary function which transforms a value x into a one-hot encoding based on a list of permitted values for x:

In [117]:
def one_hot_encoding(x, permitted_list):
    """
    Maps input elements x which are not in the permitted list to the last element of the permitted list.
    """
    
    if x not in permitted_list:
        x = permitted_list[-1]
        
    binary_encoding = [int(boolean_value) for boolean_value in list(map(lambda s: x == s, permitted_list))]
    
    return binary_encoding

Now we use this auxiliary function to define the actual atom featurisation function:

In [118]:
def get_atom_features(atom, 
                      use_chirality = True, 
                      hydrogens_implicit = True):
    """
    Takes an RDKit atom object as input and gives a 1d-numpy array of atom features as output.
    """
    
    # define list of permitted atoms
    permitted_list_of_atoms =  ['C','N','O','S','F','Si','P','Cl','Br','Mg','Na','Ca','Fe','As','Al','I', 'B','V','K',
                                'Tl','Yb','Sb','Sn','Ag','Pd','Co','Se','Ti','Zn', 'Li','Ge','Cu','Au','Ni','Cd','In',
                                'Mn','Zr','Cr','Pt','Hg','Pb','Unknown']
    
    if hydrogens_implicit == False:
        permitted_list_of_atoms = ['H'] + permitted_list_of_atoms
        
    
    # compute atom features
    atom_type_enc = one_hot_encoding(str(atom.GetSymbol()), permitted_list_of_atoms)
    
    n_heavy_neighbors_enc = one_hot_encoding(int(atom.GetDegree()), [0, 1, 2, 3, 4, "MoreThanFour"])
    
    formal_charge_enc = one_hot_encoding(int(atom.GetFormalCharge()), [-3, -2, -1, 0, 1, 2, 3, "Extreme"])
    
    hybridisation_type_enc = one_hot_encoding(str(atom.GetHybridization()), ["S", "SP", "SP2", "SP3", "SP3D", "SP3D2",
                                                                             "OTHER"])
    
    is_in_a_ring_enc = [int(atom.IsInRing())]
    
    is_aromatic_enc = [int(atom.GetIsAromatic())]
    
    atomic_mass_scaled = [float((atom.GetMass() - 10.812)/116.092)]
    
    vdw_radius_scaled = [float((Chem.GetPeriodicTable().GetRvdw(atom.GetAtomicNum()) - 1.5)/0.6)]
    
    covalent_radius_scaled = [float((Chem.GetPeriodicTable().GetRcovalent(atom.GetAtomicNum()) - 0.64)/0.76)]
    atom_feature_vector = atom_type_enc + n_heavy_neighbors_enc + formal_charge_enc + hybridisation_type_enc + is_in_a_ring_enc + is_aromatic_enc + atomic_mass_scaled + vdw_radius_scaled + covalent_radius_scaled
                                    
    if use_chirality == True:
        chirality_type_enc = one_hot_encoding(str(atom.GetChiralTag()), ["CHI_UNSPECIFIED", "CHI_TETRAHEDRAL_CW", 
                                                                         "CHI_TETRAHEDRAL_CCW", "CHI_OTHER"])
        atom_feature_vector += chirality_type_enc
    
    if hydrogens_implicit == True:
        n_hydrogens_enc = one_hot_encoding(int(atom.GetTotalNumHs()), [0, 1, 2, 3, 4, "MoreThanFour"])
        atom_feature_vector += n_hydrogens_enc
    return np.array(atom_feature_vector)

### Step 2: Bond Featurisation

Now that a constructed function to conveniently turn RDKit atom objects into feature vectors, we define an analogous function for RDKit bond objects:

In [119]:
def get_bond_features(bond, 
                      use_stereochemistry = True):
    """
    Takes an RDKit bond object as input and gives a 1d-numpy array of bond features as output.
    """

    permitted_list_of_bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE,
                                    Chem.rdchem.BondType.AROMATIC]

    bond_type_enc = one_hot_encoding(bond.GetBondType(), permitted_list_of_bond_types)
    
    bond_is_conj_enc = [int(bond.GetIsConjugated())]
    
    bond_is_in_ring_enc = [int(bond.IsInRing())]
    
    bond_feature_vector = bond_type_enc + bond_is_conj_enc + bond_is_in_ring_enc
    
    if use_stereochemistry == True:
        stereo_type_enc = one_hot_encoding(str(bond.GetStereo()), ["STEREOZ", "STEREOE", "STEREOANY", "STEREONONE"])
        bond_feature_vector += stereo_type_enc

    return np.array(bond_feature_vector)

The bond features we consider in the above function are: bond type, whether the bond is conjugated, and whether the bond is in a ring. As an additional option, the user can specify whether to include E-Z stereochemical features around double bonds.

###  Features for compound 1 (i.e., Drugs)

In [120]:
# Mordred descriptors

featurizer = dc.feat.MordredDescriptors(ignore_3D=True)
mordred_drugs_features = featurizer.featurize(covid_drugs_data_updated['canonical_smiles'].to_list())
mordred_drugs_features

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


array([[ 12.96328069,  12.24825346,   0.        , ..., 105.        ,
          7.02777778,   3.80555556],
       [ 12.96328069,  12.24825346,   0.        , ..., 105.        ,
          7.02777778,   3.80555556],
       [ 12.96328069,  12.24825346,   0.        , ..., 105.        ,
          7.02777778,   3.80555556],
       ...,
       [ 43.99102212,  32.29549569,   0.        , ..., 328.        ,
         24.94444444,  12.77777778],
       [ 43.99102212,  32.29549569,   0.        , ..., 328.        ,
         24.94444444,  12.77777778],
       [ 20.31159544,  15.73111702,   0.        , ..., 157.        ,
          9.09027778,   5.58333333]])

In [121]:
len(mordred_drugs_features)

740

In [122]:
type(mordred_drugs_features)

numpy.ndarray

In [123]:
mordred_drugs_features.shape

(740, 1613)

###  Features for compound 2 (i.e., Target)

In [124]:
# Mordred descriptors

featurizer_target = dc.feat.MordredDescriptors(ignore_3D=True)
target_mordred_features = featurizer_target.featurize(covid_drugs_data_updated['target_organism_isomeric_smiles'].to_list())
target_mordred_features

array([[ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       ...,
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ]])

In [125]:
len(target_mordred_features)

740

In [126]:
type(target_mordred_features)

numpy.ndarray

In [127]:
target_mordred_features.shape

(740, 1613)

In [128]:
featurizer_target

MordredDescriptors[ignore_3D=True]

### Concatenate and standardize all mordred features for drugs and covid targets

In [129]:
mordred_final_features = np.concatenate((mordred_drugs_features, target_mordred_features), axis=1)
mordred_final_features

array([[ 12.96328069,  12.24825346,   0.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 12.96328069,  12.24825346,   0.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 12.96328069,  12.24825346,   0.        , ..., 197.        ,
         15.375     ,   7.5       ],
       ...,
       [ 43.99102212,  32.29549569,   0.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 43.99102212,  32.29549569,   0.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 20.31159544,  15.73111702,   0.        , ..., 197.        ,
         15.375     ,   7.5       ]])

In [130]:
mordred_final_features.shape

(740, 3226)

In [131]:
type(mordred_final_features)

numpy.ndarray

#### Standardize mordred features for drugs and covid targets

In [132]:
mordred_final_features_standardized = scaler.fit_transform(mordred_final_features)
mordred_final_features_standardized

array([[-1.0328746 , -0.84438913, -0.40871852, ...,  0.        ,
         0.        ,  0.        ],
       [-1.0328746 , -0.84438913, -0.40871852, ...,  0.        ,
         0.        ,  0.        ],
       [-1.0328746 , -0.84438913, -0.40871852, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 2.59397292,  2.3357297 , -0.40871852, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.59397292,  2.3357297 , -0.40871852, ...,  0.        ,
         0.        ,  0.        ],
       [-0.17392658, -0.29189817, -0.40871852, ...,  0.        ,
         0.        ,  0.        ]])

In [133]:
mordred_final_features_standardized.shape

(740, 3226)

In [134]:
type(mordred_final_features_standardized)

numpy.ndarray

### The labels of the drugs with their respective organism target.

In [135]:
covid_label = covid_drugs_data_updated.iloc[:, 10]
covid_label

0      0
1      0
2      0
3      0
4      0
      ..
735    0
736    0
737    0
738    0
739    0
Name: bioactivity_class_labels (0-inactive, 1-active, 2-intermediate), Length: 740, dtype: int64

In [136]:
type(covid_label)

pandas.core.series.Series

In [137]:
print("The number of zeros ('0') targets in nv_data is: ", sum(covid_label == 0))

The number of zeros ('0') targets in nv_data is:  349


In [138]:
print("The number of ones ('1') targets in nv_data is: ", sum(covid_label == 1))

The number of ones ('1') targets in nv_data is:  391


### Calculation: Mordred, RDkit, Deepchem features and standardization

#### Mordred, Deepchem features and standardization

In [139]:
def create_pytorch_geometric_graph_data_list_from_smiles_and_labels_comp_api(mordred_drugs_features, x_smiles, y):
    """
    Inputs:
    
    x_smiles = [smiles_1, smiles_2, ....] ... a list of SMILES strings
    y = [y_1, y_2, ...] ... a list of numerial labels for the SMILES strings (such as associated pKi values)
    
    Outputs:
    
    data_list = [G_1, G_2, ...] ... a list of torch_geometric.data.Data objects which represent labeled molecular graphs 
    that can readily be used for machine learning
    
    """
    
    data_list = []
#     X_s = []
    counter = 0
    data = []
    print(len(x_smiles))
    print(len(y))
    print("mordred_des: ",len(mordred_drugs_features))
    
    for (smile, y_val) in zip(x_smiles, y):
#         print("=============================================")
#         print(smile, " label:", y_val)
#         print("=============================================")
        # convert SMILES to RDKit mol object
        mol = Chem.MolFromSmiles(smile)
        
#         for mol in mols:
        # get feature dimensions
        n_nodes = mol.GetNumAtoms() 
        n_edges = 2*mol.GetNumBonds()
        unrelated_smiles = "O=O"
        unrelated_mol = Chem.MolFromSmiles(unrelated_smiles)
        n_node_features = len(get_atom_features(unrelated_mol.GetAtomWithIdx(0)))
        n_edge_features = len(get_bond_features(unrelated_mol.GetBondBetweenAtoms(0,1)))

        # construct node feature matrix X of shape (n_nodes, n_node_features)
        X = np.zeros((1, n_node_features+mordred_drugs_features.shape[1])) # n_node_features
        
#         print(X.shape)
        loop_counter = 0
        for atom in mol.GetAtoms():
#             loop_counter += 1
            X = np.hstack((get_atom_features(atom), mordred_drugs_features[counter]))
        data.append(X)
#             print(atom.GetIdx())
#             print(np.hstack((get_atom_features(atom), mordred_des[counter])).shape)
        counter += 1
        print("counter: ",counter)
        print("loop_counter: ",loop_counter)
        print("****************************************************")
    print(counter)
    
    
#         print(X.shape)
#         break
#        X = torch.tensor(X, dtype = torch.float)
#        X_s.append(X)
    return data    

#### Drugs compounds smiles

In [140]:
drugs_comp_smiles = covid_drugs_data_updated['canonical_smiles']
drugs_comp_smiles

0          NC(=O)c1ncn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)n1
1          NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)nn1
2          NC(=O)c1cn([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)cn1
3      Nc1nc(O)c2[nH]cc([C@@H]3C=C(CO)[C@@H](O)[C@H]3...
4      O=c1[nH]cc([C@@H]2C=C(CO)[C@@H](O)[C@H]2O)c(=O...
                             ...                        
735    CC(C)CC1C(=O)NC(C(=O)NC(CC2=CC(=NC=C2)C3=NC(CS...
736    CC(C)C[C@H]1C(=O)N[C@H](C(=O)N[C@@H](CC2=CC(=N...
737    CC1C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(CC2...
738    C[C@H]1C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@H](C(=...
739    CN1C=C(C2=CC=CC=C21)SCC(=O)NCC3=CC=C(C=C3)C(F)...
Name: canonical_smiles, Length: 740, dtype: object

In [141]:
type(drugs_comp_smiles)

pandas.core.series.Series

In [142]:
len(drugs_comp_smiles)

740

In [143]:
drugs_comp_smiles.shape

(740,)

### Calculation of atom, bond and molecules features for Drugs

In [144]:
data_list_api = create_pytorch_geometric_graph_data_list_from_smiles_and_labels_comp_api(mordred_drugs_features, drugs_comp_smiles, covid_label)
data_list_api

740
740
mordred_des:  740
counter:  1
loop_counter:  0
****************************************************
counter:  2
loop_counter:  0
****************************************************
counter:  3
loop_counter:  0
****************************************************
counter:  4
loop_counter:  0
****************************************************
counter:  5
loop_counter:  0
****************************************************
counter:  6
loop_counter:  0
****************************************************
counter:  7
loop_counter:  0
****************************************************
counter:  8
loop_counter:  0
****************************************************
counter:  9
loop_counter:  0
****************************************************
counter:  10
loop_counter:  0
****************************************************
counter:  11
loop_counter:  0
****************************************************
counter:  12
loop_counter:  0
****************************************

counter:  203
loop_counter:  0
****************************************************
counter:  204
loop_counter:  0
****************************************************
counter:  205
loop_counter:  0
****************************************************
counter:  206
loop_counter:  0
****************************************************
counter:  207
loop_counter:  0
****************************************************
counter:  208
loop_counter:  0
****************************************************
counter:  209
loop_counter:  0
****************************************************
counter:  210
loop_counter:  0
****************************************************
counter:  211
loop_counter:  0
****************************************************
counter:  212
loop_counter:  0
****************************************************
counter:  213
loop_counter:  0
****************************************************
counter:  214
loop_counter:  0
*********************************************

counter:  446
loop_counter:  0
****************************************************
counter:  447
loop_counter:  0
****************************************************
counter:  448
loop_counter:  0
****************************************************
counter:  449
loop_counter:  0
****************************************************
counter:  450
loop_counter:  0
****************************************************
counter:  451
loop_counter:  0
****************************************************
counter:  452
loop_counter:  0
****************************************************
counter:  453
loop_counter:  0
****************************************************
counter:  454
loop_counter:  0
****************************************************
counter:  455
loop_counter:  0
****************************************************
counter:  456
loop_counter:  0
****************************************************
counter:  457
loop_counter:  0
*********************************************

counter:  668
loop_counter:  0
****************************************************
counter:  669
loop_counter:  0
****************************************************
counter:  670
loop_counter:  0
****************************************************
counter:  671
loop_counter:  0
****************************************************
counter:  672
loop_counter:  0
****************************************************
counter:  673
loop_counter:  0
****************************************************
counter:  674
loop_counter:  0
****************************************************
counter:  675
loop_counter:  0
****************************************************
counter:  676
loop_counter:  0
****************************************************
counter:  677
loop_counter:  0
****************************************************
counter:  678
loop_counter:  0
****************************************************
counter:  679
loop_counter:  0
*********************************************



[array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 135.        ,
          7.5       ,   4.36111111]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   0.        ,   1.        , ..., 128.        ,
          6.63888889,   4.19444444]),
 array([  0.        ,   0.        ,   1.        , ..., 114.        ,
          7.88888889,   4.02777778]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  1.        ,   0.        ,  

In [145]:
len(data_list_api)

740

In [146]:
data_list_api

[array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 135.        ,
          7.5       ,   4.36111111]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  0.        ,   0.        ,   1.        , ..., 128.        ,
          6.63888889,   4.19444444]),
 array([  0.        ,   0.        ,   1.        , ..., 114.        ,
          7.88888889,   4.02777778]),
 array([  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556]),
 array([  1.        ,   0.        ,  

In [147]:
type(data_list_api)

list

In [148]:
api_data_list = np.array(data_list_api)

In [149]:
type(api_data_list)

numpy.ndarray

In [150]:
api_data_list

array([[  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556],
       [  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556],
       [  0.        ,   1.        ,   0.        , ..., 105.        ,
          7.02777778,   3.80555556],
       ...,
       [  1.        ,   0.        ,   0.        , ..., 328.        ,
         24.94444444,  12.77777778],
       [  1.        ,   0.        ,   0.        , ..., 328.        ,
         24.94444444,  12.77777778],
       [  0.        ,   0.        ,   0.        , ..., 157.        ,
          9.09027778,   5.58333333]])

In [151]:
api_data_list.shape

(740, 1692)

### Targets compounds smiles

In [152]:
targets_comp_smiles = covid_drugs_data_updated['target_organism_isomeric_smiles']
targets_comp_smiles

0      CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
1      CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
2      CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
3      CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
4      CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
                             ...                        
735    CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
736    CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
737    CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
738    CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
739    CC(C)C[C@H](C(=O)N[C@H](CC1CCNC1=O)C(O)S(=O)(=...
Name: target_organism_isomeric_smiles, Length: 740, dtype: object

In [153]:
type(targets_comp_smiles)

pandas.core.series.Series

In [154]:
len(targets_comp_smiles)

740

In [155]:
targets_comp_smiles.shape

(740,)

### Node features for compound 2 (i.e., Targets)

In [156]:
target_mordred_features

array([[ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       ...,
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ],
       [ 26.72384613,  22.77593513,   1.        , ..., 197.        ,
         15.375     ,   7.5       ]])

In [157]:
len(target_mordred_features)

740

In [158]:
type(target_mordred_features)

numpy.ndarray

In [159]:
target_mordred_features.shape

(740, 1613)

In [160]:
data_list_targets = create_pytorch_geometric_graph_data_list_from_smiles_and_labels_comp_api(target_mordred_features, targets_comp_smiles, covid_label)
data_list_targets

740
740
mordred_des:  740
counter:  1
loop_counter:  0
****************************************************
counter:  2
loop_counter:  0
****************************************************
counter:  3
loop_counter:  0
****************************************************
counter:  4
loop_counter:  0
****************************************************
counter:  5
loop_counter:  0
****************************************************
counter:  6
loop_counter:  0
****************************************************
counter:  7
loop_counter:  0
****************************************************
counter:  8
loop_counter:  0
****************************************************
counter:  9
loop_counter:  0
****************************************************
counter:  10
loop_counter:  0
****************************************************
counter:  11
loop_counter:  0
****************************************************
counter:  12
loop_counter:  0
****************************************

counter:  198
loop_counter:  0
****************************************************
counter:  199
loop_counter:  0
****************************************************
counter:  200
loop_counter:  0
****************************************************
counter:  201
loop_counter:  0
****************************************************
counter:  202
loop_counter:  0
****************************************************
counter:  203
loop_counter:  0
****************************************************
counter:  204
loop_counter:  0
****************************************************
counter:  205
loop_counter:  0
****************************************************
counter:  206
loop_counter:  0
****************************************************
counter:  207
loop_counter:  0
****************************************************
counter:  208
loop_counter:  0
****************************************************
counter:  209
loop_counter:  0
*********************************************

counter:  403
loop_counter:  0
****************************************************
counter:  404
loop_counter:  0
****************************************************
counter:  405
loop_counter:  0
****************************************************
counter:  406
loop_counter:  0
****************************************************
counter:  407
loop_counter:  0
****************************************************
counter:  408
loop_counter:  0
****************************************************
counter:  409
loop_counter:  0
****************************************************
counter:  410
loop_counter:  0
****************************************************
counter:  411
loop_counter:  0
****************************************************
counter:  412
loop_counter:  0
****************************************************
counter:  413
loop_counter:  0
****************************************************
counter:  414
loop_counter:  0
*********************************************

counter:  610
loop_counter:  0
****************************************************
counter:  611
loop_counter:  0
****************************************************
counter:  612
loop_counter:  0
****************************************************
counter:  613
loop_counter:  0
****************************************************
counter:  614
loop_counter:  0
****************************************************
counter:  615
loop_counter:  0
****************************************************
counter:  616
loop_counter:  0
****************************************************
counter:  617
loop_counter:  0
****************************************************
counter:  618
loop_counter:  0
****************************************************
counter:  619
loop_counter:  0
****************************************************
counter:  620
loop_counter:  0
****************************************************
counter:  621
loop_counter:  0
*********************************************

[array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   ,

In [161]:
len(data_list_targets)

740

In [162]:
data_list_targets

[array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]),
 array([  0.   ,   0.   ,   0.   ,

In [163]:
type(data_list_targets)

list

In [164]:
targets_data_list = np.array(data_list_targets)
targets_data_list

array([[  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       ...,
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]])

In [165]:
type(targets_data_list)

numpy.ndarray

In [166]:
targets_data_list

array([[  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       ...,
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]])

In [167]:
targets_data_list.shape

(740, 1692)

### Rule of five drug features

In [168]:
drugs_rule_of_five_features = covid_drugs_data_updated[['MW', 'LogP', 'NumHDonors', 'NumHAcceptors']]
drugs_rule_of_five_features

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,240.219,-2.4278,4,7
1,240.219,-2.4278,4,7
2,239.231,-1.8228,4,6
3,278.268,-1.0166,6,7
4,240.215,-2.1991,5,5
...,...,...,...,...
735,715.878,-0.9674,7,10
736,715.878,-0.9674,7,10
737,829.982,-2.5941,9,12
738,829.982,-2.5941,9,12


In [169]:
type(drugs_rule_of_five_features)

pandas.core.frame.DataFrame

In [170]:
drugs_rule_of_five_features.shape

(740, 4)

In [171]:
drugs_rule_of_five_features = np.array(drugs_rule_of_five_features)
drugs_rule_of_five_features

array([[240.219 ,  -2.4278,   4.    ,   7.    ],
       [240.219 ,  -2.4278,   4.    ,   7.    ],
       [239.231 ,  -1.8228,   4.    ,   6.    ],
       ...,
       [829.982 ,  -2.5941,   9.    ,  12.    ],
       [829.982 ,  -2.5941,   9.    ,  12.    ],
       [378.419 ,   4.6056,   1.    ,   3.    ]])

In [172]:
type(drugs_rule_of_five_features)

numpy.ndarray

In [173]:
drugs_rule_of_five_features.shape

(740, 4)

In [174]:
drugs_rule_of_five_features_standardized = scaler.fit_transform(drugs_rule_of_five_features)
drugs_rule_of_five_features_standardized

array([[-1.11269182, -2.21139955,  0.49495822,  0.32818076],
       [-1.11269182, -2.21139955,  0.49495822,  0.32818076],
       [-1.11913368, -1.94885674,  0.49495822,  0.02077093],
       ...,
       [ 2.73262258, -2.28356628,  2.34107255,  1.86522987],
       [ 2.73262258, -2.28356628,  2.34107255,  1.86522987],
       [-0.21161385,  0.84077996, -0.61271038, -0.90145853]])

### RDKit drugs and covid targets data list

#### Rdkit drug features

In [175]:
rdkit_drug_features = api_data_list[:, 0:79]
rdkit_drug_features

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [176]:
type(rdkit_drug_features)

numpy.ndarray

In [177]:
rdkit_drug_features.shape

(740, 79)

#### Rdkit targets features

In [178]:
rdkit_targets_features = targets_data_list[:, 0:79]
rdkit_targets_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [179]:
type(rdkit_targets_features)

numpy.ndarray

In [180]:
rdkit_targets_features.shape

(740, 79)

#### Concatenate drugs and covid targets data list and standardize

In [181]:
rdkit_final_features = np.concatenate((rdkit_drug_features, rdkit_targets_features), axis=1)
rdkit_final_features

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [182]:
type(rdkit_final_features)

numpy.ndarray

In [183]:
rdkit_final_features.shape

(740, 158)

In [184]:
# Standardize the concatenated features

rdkit_final_features_standardized = scaler.fit_transform(rdkit_final_features)
rdkit_final_features_standardized

array([[-1.03577465,  3.14362099, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03577465,  3.14362099, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03577465,  3.14362099, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.96546097, -0.31810451, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.96546097, -0.31810451, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03577465, -0.31810451, -0.57735027, ...,  0.        ,
         0.        ,  0.        ]])

In [185]:
type(rdkit_final_features_standardized)

numpy.ndarray

In [186]:
rdkit_final_features_standardized.shape

(740, 158)

### Concatenated and standardize RDkit and Mordred features

In [187]:
rdkit_mordred_final_features = np.concatenate((api_data_list, targets_data_list), axis=1)
rdkit_mordred_final_features

array([[  0.   ,   1.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   1.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   1.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       ...,
       [  1.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  1.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ],
       [  0.   ,   0.   ,   0.   , ..., 197.   ,  15.375,   7.5  ]])

In [188]:
type(rdkit_mordred_final_features)

numpy.ndarray

In [189]:
rdkit_mordred_final_features.shape

(740, 3384)

In [190]:
# Standardize rdkit and mordred features

rdkit_mordred_final_features_standardizied = scaler.fit_transform(rdkit_mordred_final_features)
rdkit_mordred_final_features_standardizied

array([[-1.03577465,  3.14362099, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03577465,  3.14362099, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03577465,  3.14362099, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.96546097, -0.31810451, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.96546097, -0.31810451, -0.57735027, ...,  0.        ,
         0.        ,  0.        ],
       [-1.03577465, -0.31810451, -0.57735027, ...,  0.        ,
         0.        ,  0.        ]])

In [191]:
type(rdkit_mordred_final_features_standardizied)

numpy.ndarray

In [192]:
rdkit_mordred_final_features_standardizied.shape

(740, 3384)

### RDKit and Rule of 5 features

In [193]:
rdkit_rule_of_five_features_standardized = np.concatenate((rdkit_final_features_standardized, drugs_rule_of_five_features_standardized), axis=1)
rdkit_rule_of_five_features_standardized

array([[-1.03577465,  3.14362099, -0.57735027, ..., -2.21139955,
         0.49495822,  0.32818076],
       [-1.03577465,  3.14362099, -0.57735027, ..., -2.21139955,
         0.49495822,  0.32818076],
       [-1.03577465,  3.14362099, -0.57735027, ..., -1.94885674,
         0.49495822,  0.02077093],
       ...,
       [ 0.96546097, -0.31810451, -0.57735027, ..., -2.28356628,
         2.34107255,  1.86522987],
       [ 0.96546097, -0.31810451, -0.57735027, ..., -2.28356628,
         2.34107255,  1.86522987],
       [-1.03577465, -0.31810451, -0.57735027, ...,  0.84077996,
        -0.61271038, -0.90145853]])

In [194]:
type(rdkit_rule_of_five_features_standardized)

numpy.ndarray

In [195]:
rdkit_rule_of_five_features_standardized.shape

(740, 162)

### TF-IDF, RDKit and Rule of 5 features

In [196]:
tfidf_final_features_updated = np.concatenate((drug_smiles_vect_features, isomeric_target_smiles_vect_features), axis=1)
tfidf_final_features_updated

array([[0.09729017, 0.        , 0.19563355, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.09729017, 0.        , 0.19563355, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.0963129 , 0.        , 0.19366844, ..., 0.10721125, 0.28589668,
        0.03573708],
       ...,
       [0.03922663, 0.        , 0.37467002, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.03189533, 0.        , 0.30464571, ..., 0.10721125, 0.28589668,
        0.03573708],
       [0.08083054, 0.        , 0.20317015, ..., 0.10721125, 0.28589668,
        0.03573708]])

In [197]:
type(tfidf_final_features_updated)

numpy.ndarray

In [198]:
tfidf_final_features_updated.shape

(740, 52)

In [199]:
tfidf_final_features_standardized_updated = scaler.fit_transform(tfidf_final_features_updated)
tfidf_final_features_standardized_updated

array([[ 3.04596307e-01, -2.31128006e-01, -1.93991294e-01, ...,
        -1.88737914e-15,  3.16413562e-15,  3.95516953e-16],
       [ 3.04596307e-01, -2.31128006e-01, -1.93991294e-01, ...,
        -1.88737914e-15,  3.16413562e-15,  3.95516953e-16],
       [ 2.76518587e-01, -2.31128006e-01, -2.21037978e-01, ...,
        -1.88737914e-15,  3.16413562e-15,  3.95516953e-16],
       ...,
       [-1.36362035e+00, -2.31128006e-01,  2.27016487e+00, ...,
        -1.88737914e-15,  3.16413562e-15,  3.95516953e-16],
       [-1.57425498e+00, -2.31128006e-01,  1.30639003e+00, ...,
        -1.88737914e-15,  3.16413562e-15,  3.95516953e-16],
       [-1.68303217e-01, -2.31128006e-01, -9.02619004e-02, ...,
        -1.88737914e-15,  3.16413562e-15,  3.95516953e-16]])

In [200]:
type(tfidf_final_features_standardized_updated)

numpy.ndarray

In [201]:
tfidf_final_features_standardized_updated.shape

(740, 52)

In [202]:
# Concatenate tfidf, rdkit, and rule of five features

tfidf_rdkit_rule_of_five_features_standardized = np.concatenate((tfidf_final_features_standardized_updated, rdkit_rule_of_five_features_standardized), axis=1)
tfidf_rdkit_rule_of_five_features_standardized

array([[ 0.30459631, -0.23112801, -0.19399129, ..., -2.21139955,
         0.49495822,  0.32818076],
       [ 0.30459631, -0.23112801, -0.19399129, ..., -2.21139955,
         0.49495822,  0.32818076],
       [ 0.27651859, -0.23112801, -0.22103798, ..., -1.94885674,
         0.49495822,  0.02077093],
       ...,
       [-1.36362035, -0.23112801,  2.27016487, ..., -2.28356628,
         2.34107255,  1.86522987],
       [-1.57425498, -0.23112801,  1.30639003, ..., -2.28356628,
         2.34107255,  1.86522987],
       [-0.16830322, -0.23112801, -0.0902619 , ...,  0.84077996,
        -0.61271038, -0.90145853]])

In [203]:
type(tfidf_rdkit_rule_of_five_features_standardized)

numpy.ndarray

In [204]:
tfidf_rdkit_rule_of_five_features_standardized.shape

(740, 214)

### TF-IDF, RDKit, and Mordred features

In [205]:
tfidf_rdkit_mordred_final_features_standardized = np.concatenate((tfidf_final_features_standardized_updated, rdkit_mordred_final_features_standardizied ), axis=1)
tfidf_rdkit_mordred_final_features_standardized

array([[ 0.30459631, -0.23112801, -0.19399129, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.30459631, -0.23112801, -0.19399129, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.27651859, -0.23112801, -0.22103798, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.36362035, -0.23112801,  2.27016487, ...,  0.        ,
         0.        ,  0.        ],
       [-1.57425498, -0.23112801,  1.30639003, ...,  0.        ,
         0.        ,  0.        ],
       [-0.16830322, -0.23112801, -0.0902619 , ...,  0.        ,
         0.        ,  0.        ]])

In [206]:
type(tfidf_rdkit_mordred_final_features_standardized)

numpy.ndarray

In [207]:
tfidf_rdkit_mordred_final_features_standardized.shape

(740, 3436)

## Create Covid-19 Graph Data with TF-IDF, RDKit, Mordred and Rule of 5 Features

#### drugs and targets node labels

In [208]:
covid_label.values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [209]:
type(covid_label.values)

numpy.ndarray

In [210]:
(covid_label.values).shape

(740,)

In [211]:
y = torch.tensor(covid_label.values)
y

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [212]:
type(y)

torch.Tensor

In [213]:
len(y)

740

In [214]:
y.shape

torch.Size([740])

#### drugs and targets edge index and type

In [215]:
# Edge index

edge_index

tensor([[  1,   2,   3,  ..., 614, 614, 614],
        [609, 609, 609,  ..., 606, 607, 608]])

In [216]:
len(edge_index)

2

In [217]:
type(edge_index)

torch.Tensor

In [218]:
edge_index.shape

torch.Size([2, 1480])

In [219]:
# Edge type

edge_type

tensor([0, 0, 0,  ..., 0, 0, 0])

In [220]:
len(edge_type)

1480

In [221]:
type(edge_type)

torch.Tensor

In [222]:
edge_type.shape

torch.Size([1480])

### Create and save final covid-19 graph dataset for gnn models training

###### Graph with both drug_target_corona_virus_organisms_similiarities_graph_data interactions and drugs-target similarity as the edges
###### Tf-idf, Rule of five, RDKit atoms features, RDKit bonds features, and Mordred molecule feature descriptors

### 1. Tf-idf as node features

In [223]:
# Convert numpy array features into tensor format

tfidf_final_features_standardized_updated_tensors = torch.tensor(tfidf_final_features_standardized_updated, dtype=torch.float)
tfidf_final_features_standardized_updated_tensors

tensor([[ 3.0460e-01, -2.3113e-01, -1.9399e-01,  ..., -1.8874e-15,
          3.1641e-15,  3.9552e-16],
        [ 3.0460e-01, -2.3113e-01, -1.9399e-01,  ..., -1.8874e-15,
          3.1641e-15,  3.9552e-16],
        [ 2.7652e-01, -2.3113e-01, -2.2104e-01,  ..., -1.8874e-15,
          3.1641e-15,  3.9552e-16],
        ...,
        [-1.3636e+00, -2.3113e-01,  2.2702e+00,  ..., -1.8874e-15,
          3.1641e-15,  3.9552e-16],
        [-1.5743e+00, -2.3113e-01,  1.3064e+00,  ..., -1.8874e-15,
          3.1641e-15,  3.9552e-16],
        [-1.6830e-01, -2.3113e-01, -9.0262e-02,  ..., -1.8874e-15,
          3.1641e-15,  3.9552e-16]])

In [224]:
type(tfidf_final_features_standardized_updated_tensors)

torch.Tensor

In [225]:
tfidf_final_features_standardized_updated_tensors.shape

torch.Size([740, 52])

In [226]:
from torch_geometric.data import Data

In [227]:
tfidf_graph_data = Data(x=tfidf_final_features_standardized_updated_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
tfidf_graph_data

Data(x=[740, 52], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [228]:
type(tfidf_graph_data)

torch_geometric.data.data.Data

### 2. Mordred as node features

In [229]:
# Convert numpy array features into tensor format

mordred_final_features_standardized_tensors = torch.tensor(mordred_final_features_standardized, dtype=torch.float)
mordred_final_features_standardized_tensors

tensor([[-1.0329, -0.8444, -0.4087,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0329, -0.8444, -0.4087,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0329, -0.8444, -0.4087,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 2.5940,  2.3357, -0.4087,  ...,  0.0000,  0.0000,  0.0000],
        [ 2.5940,  2.3357, -0.4087,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1739, -0.2919, -0.4087,  ...,  0.0000,  0.0000,  0.0000]])

In [230]:
type(mordred_final_features_standardized_tensors)

torch.Tensor

In [231]:
mordred_final_features_standardized_tensors.shape

torch.Size([740, 3226])

In [232]:
mordred_graph_data = Data(x=mordred_final_features_standardized_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
mordred_graph_data

Data(x=[740, 3226], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [233]:
type(mordred_graph_data)

torch_geometric.data.data.Data

### 3. Rdkit as node features

In [234]:
# Convert numpy array features into tensor format

rdkit_final_features_standardized_tensors = torch.tensor(rdkit_final_features_standardized, dtype=torch.float)
rdkit_final_features_standardized_tensors

tensor([[-1.0358,  3.1436, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0358,  3.1436, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0358,  3.1436, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.9655, -0.3181, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.9655, -0.3181, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0358, -0.3181, -0.5774,  ...,  0.0000,  0.0000,  0.0000]])

In [235]:
type(rdkit_final_features_standardized_tensors)

torch.Tensor

In [236]:
rdkit_final_features_standardized_tensors.shape

torch.Size([740, 158])

In [237]:
rdkit_graph_data = Data(x=rdkit_final_features_standardized_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
rdkit_graph_data

Data(x=[740, 158], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [238]:
type(rdkit_graph_data)

torch_geometric.data.data.Data

### 4. Rule of five as node features

In [239]:
# Convert numpy array features into tensor format

drugs_rule_of_five_features_standardized_tensors = torch.tensor(drugs_rule_of_five_features_standardized, dtype=torch.float)
drugs_rule_of_five_features_standardized_tensors

tensor([[-1.1127, -2.2114,  0.4950,  0.3282],
        [-1.1127, -2.2114,  0.4950,  0.3282],
        [-1.1191, -1.9489,  0.4950,  0.0208],
        ...,
        [ 2.7326, -2.2836,  2.3411,  1.8652],
        [ 2.7326, -2.2836,  2.3411,  1.8652],
        [-0.2116,  0.8408, -0.6127, -0.9015]])

In [240]:
type(drugs_rule_of_five_features_standardized_tensors)

torch.Tensor

In [241]:
drugs_rule_of_five_features_standardized_tensors.shape

torch.Size([740, 4])

In [242]:
rule_of_five_graph_data = Data(x=drugs_rule_of_five_features_standardized_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
rule_of_five_graph_data

Data(x=[740, 4], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [243]:
type(rule_of_five_graph_data)

torch_geometric.data.data.Data

### 5. Rdkit and mordred as node features

In [244]:
# Convert numpy array features into tensor format

rdkit_mordred_final_features_standardizied_tensors = torch.tensor(rdkit_mordred_final_features_standardizied, dtype=torch.float)
rdkit_mordred_final_features_standardizied_tensors

tensor([[-1.0358,  3.1436, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0358,  3.1436, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0358,  3.1436, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.9655, -0.3181, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.9655, -0.3181, -0.5774,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0358, -0.3181, -0.5774,  ...,  0.0000,  0.0000,  0.0000]])

In [245]:
type(rdkit_mordred_final_features_standardizied_tensors)

torch.Tensor

In [246]:
rdkit_mordred_final_features_standardizied_tensors.shape

torch.Size([740, 3384])

In [247]:
rdkit_mordred_graph_data = Data(x=rdkit_mordred_final_features_standardizied_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
rdkit_mordred_graph_data

Data(x=[740, 3384], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [248]:
type(rdkit_mordred_graph_data)

torch_geometric.data.data.Data

### 6. Rdkit and Rule of five as node features

In [249]:
# Convert numpy array features into tensor format

rdkit_rule_of_five_features_standardized_tensors = torch.tensor(rdkit_rule_of_five_features_standardized, dtype=torch.float)
rdkit_rule_of_five_features_standardized_tensors

tensor([[-1.0358,  3.1436, -0.5774,  ..., -2.2114,  0.4950,  0.3282],
        [-1.0358,  3.1436, -0.5774,  ..., -2.2114,  0.4950,  0.3282],
        [-1.0358,  3.1436, -0.5774,  ..., -1.9489,  0.4950,  0.0208],
        ...,
        [ 0.9655, -0.3181, -0.5774,  ..., -2.2836,  2.3411,  1.8652],
        [ 0.9655, -0.3181, -0.5774,  ..., -2.2836,  2.3411,  1.8652],
        [-1.0358, -0.3181, -0.5774,  ...,  0.8408, -0.6127, -0.9015]])

In [250]:
type(rdkit_rule_of_five_features_standardized_tensors)

torch.Tensor

In [251]:
rdkit_rule_of_five_features_standardized_tensors.shape

torch.Size([740, 162])

In [252]:
rdkit_rule_of_five_graph_data = Data(x=rdkit_rule_of_five_features_standardized_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
rdkit_rule_of_five_graph_data

Data(x=[740, 162], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [253]:
type(rdkit_rule_of_five_graph_data)

torch_geometric.data.data.Data

### 7. Tf-idf, Rdkit, and Rule of five as node features

In [254]:
# Convert numpy array features into tensor format

tfidf_rdkit_rule_of_five_features_standardized_tensors = torch.tensor(tfidf_rdkit_rule_of_five_features_standardized, dtype=torch.float)
tfidf_rdkit_rule_of_five_features_standardized_tensors

tensor([[ 0.3046, -0.2311, -0.1940,  ..., -2.2114,  0.4950,  0.3282],
        [ 0.3046, -0.2311, -0.1940,  ..., -2.2114,  0.4950,  0.3282],
        [ 0.2765, -0.2311, -0.2210,  ..., -1.9489,  0.4950,  0.0208],
        ...,
        [-1.3636, -0.2311,  2.2702,  ..., -2.2836,  2.3411,  1.8652],
        [-1.5743, -0.2311,  1.3064,  ..., -2.2836,  2.3411,  1.8652],
        [-0.1683, -0.2311, -0.0903,  ...,  0.8408, -0.6127, -0.9015]])

In [255]:
type(tfidf_rdkit_rule_of_five_features_standardized_tensors)

torch.Tensor

In [256]:
tfidf_rdkit_rule_of_five_features_standardized_tensors.shape

torch.Size([740, 214])

In [257]:
tfidf_rdkit_rule_of_five_graph_data = Data(x=tfidf_rdkit_rule_of_five_features_standardized_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
tfidf_rdkit_rule_of_five_graph_data

Data(x=[740, 214], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [258]:
type(tfidf_rdkit_rule_of_five_graph_data)

torch_geometric.data.data.Data

### 8. Tf-idf, Rdkit, and Mordred as node features

In [259]:
# Convert numpy array features into tensor format

tfidf_rdkit_mordred_final_features_standardized_tensors = torch.tensor(tfidf_rdkit_mordred_final_features_standardized, dtype=torch.float)
tfidf_rdkit_mordred_final_features_standardized_tensors

tensor([[ 0.3046, -0.2311, -0.1940,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.3046, -0.2311, -0.1940,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2765, -0.2311, -0.2210,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-1.3636, -0.2311,  2.2702,  ...,  0.0000,  0.0000,  0.0000],
        [-1.5743, -0.2311,  1.3064,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1683, -0.2311, -0.0903,  ...,  0.0000,  0.0000,  0.0000]])

In [260]:
type(tfidf_rdkit_mordred_final_features_standardized_tensors)

torch.Tensor

In [261]:
tfidf_rdkit_mordred_final_features_standardized_tensors.shape

torch.Size([740, 3436])

In [262]:
tfidf_rdkit_mordred_graph_data = Data(x=tfidf_rdkit_mordred_final_features_standardized_tensors, edge_index=edge_index, edge_type=edge_type, y=y)
tfidf_rdkit_mordred_graph_data

Data(x=[740, 3436], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [263]:
type(tfidf_rdkit_mordred_graph_data)

torch_geometric.data.data.Data

### Save the data

In [264]:
#torch.save(tfidf_graph_data, 'tfidf_graph_data.pt')

In [265]:
#torch.save(mordred_graph_data, 'mordred_graph_data.pt')

In [266]:
#torch.save(rdkit_graph_data, 'rdkit_graph_data.pt')

In [267]:
#torch.save(rule_of_five_graph_data, 'rule_of_five_graph_data.pt')

In [268]:
#torch.save(rdkit_mordred_graph_data, 'rdkit_mordred_graph_data.pt')

In [269]:
#torch.save(rdkit_rule_of_five_graph_data, 'rdkit_rule_of_five_graph_data.pt')

In [270]:
#torch.save(tfidf_rdkit_rule_of_five_graph_data, 'tfidf_rdkit_rule_of_five_graph_data.pt')

In [271]:
#torch.save(tfidf_rdkit_mordred_graph_data, 'tfidf_rdkit_mordred_graph_data.pt')