In [1]:
import matplotlib.pyplot as plt

import h5py
import numpy as np
import pandas as pd
import torch

from collections import Counter

from torch.utils.data import ConcatDataset, DataLoader
from torch_geometric.data import DataListLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from IPython.display import clear_output
from IPython.core.display import HTML
HTML("<style>.container { width:98% !important; }</style>")

In [3]:
os.chdir("..")
print("Current working directory:", os.getcwd())

Current working directory: /g/g92/noh1/fast2


In [4]:
data_dir = "./data/gmd"
file_name = "gmd_fast2_input.h5"
file_path = os.path.join(data_dir, file_name)
print(os.path.exists(file_path))

True


In [5]:
f = h5py.File(file_path, 'r')

In [6]:
data_names = list(f)

In [7]:
len(data_names)

410

In [8]:
data_names[:15]

['CHEMBL1204124_protease2_1',
 'CHEMBL1204124_protease2_10',
 'CHEMBL1204124_protease2_2',
 'CHEMBL1204124_protease2_3',
 'CHEMBL1204124_protease2_4',
 'CHEMBL1204124_protease2_5',
 'CHEMBL1204124_protease2_6',
 'CHEMBL1204124_protease2_7',
 'CHEMBL1204124_protease2_8',
 'CHEMBL1204124_protease2_9',
 'CHEMBL1204920_protease2_1',
 'CHEMBL1204920_protease2_10',
 'CHEMBL1204920_protease2_2',
 'CHEMBL1204920_protease2_3',
 'CHEMBL1204920_protease2_4']

In [9]:
idx = 1
name = data_names[idx]
# print(f[name]["spatial"].attrs["affinity"])

In [10]:
f.attrs.keys()

<KeysViewHDF5 []>

In [11]:
# attr_names = list(f.attrs.keys())
# for attr_name in attr_names:
#     attr_value = f.attrs[attr_name]
#     print(f"Attribute '{attr_name}': {attr_value}")

for k in f[name].keys():
    print(k)
    for info in f[name][k]:
        print("  ", f[name][k][info])

spatial
   <HDF5 dataset "coords": shape (440, 3), type "<f4">
   <HDF5 dataset "dists": shape (440, 440), type "<f4">
   <HDF5 dataset "node_feats": shape (440, 20), type "<f4">


In [12]:
f[name]["spatial"]["coords"][:]

array([[-0.04844284,  0.6784439 , -0.6302967 ],
       [ 1.0345573 , -1.2185555 ,  2.6367035 ],
       [-1.1584435 , -0.15955544,  3.0217037 ],
       ...,
       [-5.0784454 , -7.2135563 , 17.106703  ],
       [-5.1464424 , -8.748556  , 17.001703  ],
       [-4.162445  , -9.310556  , 16.037703  ]], dtype=float32)

In [13]:
f[name]["spatial"]["node_feats"][:]

array([[1.7 , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.7 , 0.  , 1.  , ..., 0.  , 0.  , 1.  ],
       [1.7 , 0.  , 1.  , ..., 0.  , 0.  , 1.  ],
       ...,
       [1.7 , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.7 , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.55, 0.  , 0.  , ..., 1.  , 1.  , 0.  ]], dtype=float32)

In [14]:
f[name]["spatial"]["dists"][:]

array([[ 0.       ,  3.9299855,  3.9078705, ..., 20.05457  , 20.633598 ,
        19.862713 ],
       [ 3.9299855,  0.       ,  2.4655545, ..., 16.81338  , 17.356812 ,
        16.49473  ],
       [ 3.9078705,  2.4655545,  0.       , ..., 16.23307  , 16.885363 ,
        16.192007 ],
       ...,
       [20.05457  , 16.81338  , 16.23307  , ...,  0.       ,  1.5400887,
         2.5257132],
       [20.633598 , 17.356812 , 16.885363 , ...,  1.5400887,  0.       ,
         1.4877474],
       [19.862713 , 16.49473  , 16.192007 , ...,  2.5257132,  1.4877474,
         0.       ]], dtype=float32)

In [15]:
x = np.concatenate((f[name]["spatial"]["coords"][:], f[name]["spatial"]["node_feats"][:]), axis=-1)
x.shape

(440, 23)

In [16]:
x

array([[-0.04844284,  0.6784439 , -0.6302967 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.0345573 , -1.2185555 ,  2.6367035 , ...,  0.        ,
         0.        ,  1.        ],
       [-1.1584435 , -0.15955544,  3.0217037 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-5.0784454 , -7.2135563 , 17.106703  , ...,  0.        ,
         0.        ,  0.        ],
       [-5.1464424 , -8.748556  , 17.001703  , ...,  0.        ,
         0.        ,  0.        ],
       [-4.162445  , -9.310556  , 16.037703  , ...,  1.        ,
         1.        ,  0.        ]], dtype=float32)

In [17]:
def get_compound_id(docking_id):
    return docking_id.rsplit("_", 2)[0]

compound_ids = set([get_compound_id(docking_id) for docking_id in data_names])
print(list(compound_ids)[:10])
len(compound_ids)

['CHEMBL436136', 'CHEMBL3357118', 'CHEMBL436993', 'CHEMBL434334', 'CHEMBL4520950', 'CHEMBL431724', 'CHEMBL433967', 'CHEMBL436353', 'CHEMBL433975', 'CHEMBL1205015']


41